Source code for order.dataset

# coding: utf-8

"""
Classes to define datasets.
"""


__all__ = ["Dataset", "DatasetInfo"]


import six

from order.unique import UniqueObject, UniqueObjectIndex, unique_tree
from order.mixins import CopyMixin, AuxDataMixin, TagMixin, DataSourceMixin, LabelMixin
from order.process import Process
from order.shift import Shift
from order.util import typed, make_list


[docs]@unique_tree(cls=Process, parents=False, deep_children=True) class Dataset(UniqueObject, CopyMixin, AuxDataMixin, TagMixin, DataSourceMixin, LabelMixin): """ Dataset definition providing two kinds of information: 1. (systematic) shift-dependent, and 2. shift-indepent information. Independent is e.g. whether or not it contains real data, whereas shift-dependent information is e.g. the number of events in the *nominal* or a *shifted* variation. Latter information is contained in :py:class:`DatasetInfo` objects that are stored in *this* class and mapped to strings. These info objects can be accessed via :py:meth:`get_info` or via items (*__getitem__*). For convenience, some of the properties of the *nominal* :py:class:`DatasetInfo` object are accessible on this class via forwarding. **Arguments** A dataset is always measured in (real data) / created for (MC) a dedicated *campaign*, therefore it *belongs* to a :py:class:`~order.config.Campaign` object. In addition, physics *processes* can be *linked* to a dataset, therefore it *has* :py:class:`~order.process.Process` objects. When *info* is does not contain a nominal :py:class:`DatasetInfo` object (mapped to the key :py:attr:`order.shift.Shift.NOMINAL`, i.e., ``"nominal"``), all *kwargs* are used to create one. Otherwise, it should be a dictionary matching the format of the *info* mapping. *label* and *label_short* are forwarded to the :py:class:`~order.mixins.LabelMixin`, *is_data* to the :py:class:`~order.mixins.DataSourceMixin`, *tags* to the :py:class:`~order.mixins.TagMixin`, *aux* to the :py:class:`~order.mixins.AuxDataMixin`, and *name* and *id* to the :py:class:`~order.unique.UniqueObject` constructor. **Copy behavior** ``copy()`` The :py:attr:`campaign` attribute is carried over as a reference, all remaining attributes are copied. Note that the copied dataset is also registered in the campaign. ``copy_shallow()`` All attributs are copied except for the :py:attr:`campaign` and containd :py:attr:`processes` which are set to default values instead. **Example** .. code-block:: python import order as od campaign = od.Campaign("2017B", 1, ...) d = od.Dataset( name="ttH_bb", id=1, campaign=campaign, keys=["/ttHTobb_M125.../.../..."], n_files=123, n_events=456789, gen_order="nlo", ) d.info.keys() # -> ["nominal"] d["nominal"].n_files # -> 123 d.n_files # -> 123 # similar to above, but set explicit info objects d = Dataset( name="ttH_bb", id=1, campaign=campaign, info={ "nominal": { "keys": ["/ttHTobb_M125.../.../..."], "n_files": 123, "n_events": 456789, "gen_order": "nlo", }, "scale_up": { "keys": ["/ttHTobb_M125_scaleUP.../.../..."], "n_files": 100, "n_events": 40000, "gen_order": "nlo", }, }, ) d.info.keys() # -> ["nominal", "scale_up"] d["nominal"].n_files # -> 123 d.n_files # -> 123 d["scale_up"].n_files # -> 100 **Members** .. py:attribute:: campaign type: :py:class:`Campaign`, None The :py:class:`~order.config.Campaign` object this dataset belongs to. When set, *this* dataset is also added to the dataset index of the campaign object. .. py:attribute:: info type: dictionary Mapping of shift names to :py:class:`DatasetInfo` instances. .. py:attribute:: keys type: list (read-only) The dataset keys of the nominal :py:class:`DatasetInfo` object. .. py:attribute:: n_files type: integer (read-only) The number of files of the nominal :py:class:`DatasetInfo` object. .. py:attribute:: n_events type: integer (read-only) The number of events of the nominal :py:class:`DatasetInfo` object. .. py:attribute:: gen_order type: string (read-only) The generator perturbation order of the nominal :py:class:`DatasetInfo` object. """ cls_name_singular = "dataset" cls_name_plural = "datasets" # attributes for copying copy_specs = ( [ { "attr": "_campaign", "ref": True, "skip_shallow": True, }, { "attr": "_processes", "skip_shallow": True, "skip_value": CopyMixin.Deferred(lambda inst: UniqueObjectIndex(cls=Process)), }, ] + UniqueObject.copy_specs + AuxDataMixin.copy_specs + TagMixin.copy_specs + DataSourceMixin.copy_specs + LabelMixin.copy_specs ) def __init__( self, name, id, campaign=None, info=None, processes=None, label=None, label_short=None, is_data=False, tags=None, aux=None, **kwargs # noqa: C816 ): UniqueObject.__init__(self, name, id) CopyMixin.__init__(self) AuxDataMixin.__init__(self, aux=aux) TagMixin.__init__(self, tags=tags) DataSourceMixin.__init__(self, is_data=is_data) LabelMixin.__init__(self, label=label, label_short=label_short) # instance members self._campaign = None self._info = {} # set initial values if campaign is not None: self.campaign = campaign if info is not None: self.info = info if kwargs and Shift.NOMINAL not in self.info: self.info = {Shift.NOMINAL: kwargs} # set initial processes if processes is not None: self.extend_processes(processes) def __getitem__(self, name): """ Forwarded to :py:meth:`get_info`. """ return self.get_info(name)
[docs] def copy(self, *args, **kwargs): inst = super(Dataset, self).copy(*args, **kwargs) # register in the campaign if inst.campaign: inst.campaign.datasets.add(inst) return inst
@property def campaign(self): # campaign getter return self._campaign @campaign.setter def campaign(self, campaign): # campaign setter if campaign is not None and not isinstance(campaign, Campaign): raise TypeError("invalid campaign type: {}".format(campaign)) # remove this dataset from the current campaign's dataset index if self._campaign: self._campaign.datasets.remove(self) # add this dataset to the campaign's dataset index if campaign: campaign.datasets.add(self) self._campaign = campaign @typed def info(self, info): # info parser try: info = dict(info) except: raise TypeError("invalid info type: {}".format(info)) _info = {} for name, obj in info.items(): if not isinstance(name, six.string_types): raise TypeError("invalid info name type: {}".format(name)) if not isinstance(obj, DatasetInfo): if not isinstance(obj, dict): try: obj = dict(obj) except: raise TypeError("invalid info value type: {}".format(obj)) obj = DatasetInfo(**obj) _info[str(name)] = obj return _info
[docs] def set_info(self, name, info): """ Sets an :py:class:`DatasetInfo` object *info* for a given *name*. Returns the object. """ name, info = list(self.__class__.info.fparse(self, {name: info}).items())[0] self.info[name] = info return info
[docs] def get_info(self, name): """ Returns the :py:class:`DatasetInfo` object for a given *name*. """ return self.info[name]
@property def keys(self): # keys getter, nominal info object return self.info[Shift.NOMINAL].keys @property def n_files(self): # n_files getter, nominal info object return self.info[Shift.NOMINAL].n_files @property def n_events(self): # n_events getter, nominal info object return self.info[Shift.NOMINAL].n_events @property def gen_order(self): # gen_order getter, nominal info object return self.info[Shift.NOMINAL].gen_order
[docs]class DatasetInfo(CopyMixin, AuxDataMixin, TagMixin): """ Container class holding information on particular dataset variations. Instances of *this* class are typically used in :py:class:`Dataset` objects to store shift-dependent information, such as the number of files or events for a particular shift (e.g. *nominal*, *scale_up*, etc). **Arguments** *keys* denote the identifiers or *origins* of a dataset. *n_files* and *n_events* can be used for further bookkeeping. *tags* are forwarded to the :py:class:`~order.mixins.TagMixin`, and *aux* to the :py:class:`~order.mixins.AuxDataMixin`. **Copy behavior** All attributes are copied. **Members** .. py:attribute:: keys type: list The dataset keys, e.g. ``["/ttHTobb_M125.../.../..."]``. .. py:attribute:: n_files type: integer The number of files. .. py:attribute:: n_events type: integer The number of events. .. py:attribute:: gen_order type: string The generator perturbation order. """ copy_specs = ( AuxDataMixin.copy_specs + TagMixin.copy_specs ) def __init__(self, keys=None, n_files=-1, n_events=-1, gen_order=None, tags=None, aux=None): CopyMixin.__init__(self) AuxDataMixin.__init__(self, aux=aux) TagMixin.__init__(self, tags=tags) # instance members self._keys = [] self._n_files = -1 self._n_events = -1 # set initial values if keys is not None: self.keys = keys if n_files is not None: self.n_files = n_files if n_events is not None: self.n_events = n_events if gen_order is not None: self.gen_order = gen_order @typed def keys(self, keys): # keys parser _keys = [] for key in make_list(keys): if not isinstance(key, six.string_types): raise TypeError("invalid key type: {}".format(key)) _keys.append(str(key)) return _keys @typed def n_files(self, n_files): # n_files parser if isinstance(n_files, float): n_files = int(n_files) elif not isinstance(n_files, six.integer_types): raise TypeError("invalid n_files type: {}".format(n_files)) return n_files @typed def n_events(self, n_events): # n_events parser if isinstance(n_events, float): n_events = int(n_events) elif not isinstance(n_events, six.integer_types): raise TypeError("invalid n_events type: {}".format(n_events)) return n_events @typed def gen_order(self, gen_order): # gen_order parser if not isinstance(gen_order, six.string_types): raise TypeError("invalid gen_order type: {}".format(gen_order)) return gen_order
# prevent circular imports from order.config import Campaign