Source code for catalyst.data.dataset
from typing import Any, Callable, Dict, List, Union
from pathlib import Path

import numpy as np
from torch.utils.data import Dataset, Sampler

from catalyst.utils import merge_dicts

_Path = Union[str, Path]

class ListDataset(Dataset):
    """General purpose dataset class with several data sources `list_data`."""

    def __init__(
        self,
        list_data: List[Dict],
        open_fn: Callable,
        dict_transform: Callable = None,
    ):
        """
        Args:
            list_data (List[Dict]): list of dicts that store
                your data annotations
                (for example, paths to images, labels, bboxes, etc.)
            open_fn (callable): function that can open your
                annotations dict and transfer it to the data
                needed by your network (for example, open an image
                by path or tokenize a read string)
            dict_transform (callable): transforms to use on dict
                (for example, normalize image, add blur, crop/resize, etc.)
        """
        self.data = list_data
        self.open_fn = open_fn
        self.dict_transform = (
            dict_transform if dict_transform is not None else lambda x: x
        )

    def __getitem__(self, index: int) -> Any:
        """Gets an element of the dataset.

        Args:
            index (int): index of the element in the dataset

        Returns:
            Single element by index
        """
        item = self.data[index]
        dict_ = self.open_fn(item)
        dict_ = self.dict_transform(dict_)
        return dict_

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.data)
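
# Usage sketch (not part of the original module): a minimal ListDataset
# over in-memory annotations. ``_demo_open_fn`` and the key names here
# are illustrative assumptions, not library API.
_demo_annotations = [{"path": f"img_{i}.png", "label": i % 2} for i in range(4)]

def _demo_open_fn(annotation: Dict) -> Dict:
    # A real open_fn would load annotation["path"] from disk here.
    return {"features": annotation["path"], "targets": annotation["label"]}

_demo_list_dataset = ListDataset(list_data=_demo_annotations, open_fn=_demo_open_fn)
assert len(_demo_list_dataset) == 4
assert _demo_list_dataset[0] == {"features": "img_0.png", "targets": 0}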

class MergeDataset(Dataset):
    """Abstraction to merge several datasets into one dataset."""

    def __init__(self, *datasets: Dataset, dict_transform: Callable = None):
        """
        Args:
            datasets (Dataset): variable number of datasets to merge;
                all of them must have the same length
            dict_transform (callable): transforms common for all datasets
                (for example, normalize image, add blur, crop/resize, etc.)
        """
        self.length = len(datasets[0])
        assert all(len(x) == self.length for x in datasets)
        self.datasets = datasets
        self.dict_transform = dict_transform

    def __getitem__(self, index: int) -> Any:
        """Gets the items at ``index`` from all datasets and merges them.

        Args:
            index (int): index of the value in every dataset

        Returns:
            dict: merged items from every dataset
        """
        dcts = [x[index] for x in self.datasets]
        dct = merge_dicts(*dcts)
        if self.dict_transform is not None:
            dct = self.dict_transform(dct)
        return dct

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return self.length
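
# Usage sketch (not part of the original module): merging two per-key
# datasets of equal length so that every sample carries both keys. The
# key names are illustrative assumptions.
_demo_images = ListDataset([{"image": i} for i in range(10)], open_fn=lambda d: d)
_demo_labels = ListDataset([{"label": i % 2} for i in range(10)], open_fn=lambda d: d)
_demo_merged = MergeDataset(_demo_images, _demo_labels)
assert _demo_merged[0] == {"image": 0, "label": 0}
assert len(_demo_merged) == 10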

class NumpyDataset(Dataset):
    """General purpose dataset class to use with `numpy_data`."""

    def __init__(
        self,
        numpy_data: np.ndarray,
        numpy_key: str = "features",
        dict_transform: Callable = None,
    ):
        """
        Args:
            numpy_data (np.ndarray): numpy data
                (for example, embeddings, features, etc.)
            numpy_key (str): key to use for the output dictionary
            dict_transform (callable): transforms to use on dict
                (for example, normalize vector, etc.)
        """
        super().__init__()
        self.data = numpy_data
        self.key = numpy_key
        self.dict_transform = (
            dict_transform if dict_transform is not None else lambda x: x
        )

    def __getitem__(self, index: int) -> Any:
        """Gets an element of the dataset.

        Args:
            index (int): index of the element in the dataset

        Returns:
            Single element by index
        """
        dict_ = {self.key: np.copy(self.data[index])}
        dict_ = self.dict_transform(dict_)
        return dict_

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.data)
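
# Usage sketch (not part of the original module): wrapping a
# (num_samples, dim) embedding matrix. The L2-normalizing transform is
# an illustrative assumption.
_demo_embeddings = np.arange(12, dtype=np.float32).reshape(4, 3)

def _demo_normalize(dct: Dict) -> Dict:
    vec = dct["features"]
    dct["features"] = vec / (np.linalg.norm(vec) + 1e-8)
    return dct

_demo_numpy_dataset = NumpyDataset(
    _demo_embeddings, numpy_key="features", dict_transform=_demo_normalize
)
assert _demo_numpy_dataset[1]["features"].shape == (3,)
assert len(_demo_numpy_dataset) == 4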

class PathsDataset(ListDataset):
    """
    Dataset that derives features and targets from samples' filesystem paths.

    Examples:
        >>> label_fn = lambda x: x.name.split("_")[0]
        >>> dataset = PathsDataset(
        ...     filenames=Path("/path/to/images/").glob("*.jpg"),
        ...     label_fn=label_fn,
        ...     open_fn=open_fn,
        ... )
    """

    def __init__(
        self,
        filenames: List[_Path],
        open_fn: Callable[[dict], dict],
        label_fn: Callable[[_Path], Any],
        **list_dataset_params
    ):
        """
        Args:
            filenames (List[_Path]): list of file paths that store
                information about your dataset samples; these could be
                images, texts or any other files in general
            open_fn (callable): function that can open your
                annotations dict and transfer it to the data
                needed by your network (for example, open an image
                by path or tokenize a read string)
            label_fn (callable): function that can extract the target
                value from a sample path
                (for example, your sample could be an image file like
                ``/path/to/your/image_1.png``, where the target is encoded
                as a part of the file path)
            list_dataset_params (dict): base class initialization
                parameters
        """
        list_data = [
            {"features": filename, "targets": label_fn(filename)}
            for filename in filenames
        ]
        super().__init__(
            list_data=list_data, open_fn=open_fn, **list_dataset_params
        )
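
# Usage sketch (not part of the original module), expanding the doctest
# above: ``_demo_load`` is an illustrative stand-in for a real image
# reader, and the label is parsed from the file name. Note that
# ``Path.glob`` yields ``Path`` objects, so the label_fn uses ``.name``.
def _demo_load(annotation: Dict) -> Dict:
    # A real open_fn would read annotation["features"] (a path) into an array.
    return {"features": str(annotation["features"]), "targets": annotation["targets"]}

_demo_paths_dataset = PathsDataset(
    filenames=sorted(Path("/path/to/images/").glob("*.jpg")),
    label_fn=lambda path: path.name.split("_")[0],
    open_fn=_demo_load,
)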

class DatasetFromSampler(Dataset):
    """Dataset that yields the indices produced by a `Sampler`."""

    def __init__(self, sampler: Sampler):
        """
        Args:
            sampler (Sampler): PyTorch sampler whose sampling
                order this dataset exposes by index
        """
        self.sampler = sampler
        self.sampler_list = None

    def __getitem__(self, index: int):
        """Gets an element of the dataset.

        Args:
            index (int): index of the element in the dataset

        Returns:
            Single element by index
        """
        if self.sampler_list is None:
            self.sampler_list = list(self.sampler)
        return self.sampler_list[index]

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.sampler)
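
# Usage sketch (not part of the original module): wrapping a PyTorch
# sampler so that its sampling order can be indexed like a dataset
# (a building block for, e.g., re-wrapping samplers for distributed use).
from torch.utils.data import SequentialSampler

_demo_base = ListDataset([{"x": i} for i in range(5)], open_fn=lambda d: d)
_demo_index_dataset = DatasetFromSampler(SequentialSampler(_demo_base))
assert _demo_index_dataset[3] == 3  # the 4th index yielded by the sampler
assert len(_demo_index_dataset) == 5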

__all__ = [
    "ListDataset",
    "MergeDataset",
    "NumpyDataset",
    "PathsDataset",
    "DatasetFromSampler",
]