from typing import Dict, List, Optional, Tuple, Union # isort:skip
import pandas as pd
from tqdm.auto import tqdm
from catalyst.utils import args_are_not_none
from catalyst.utils.dataset import default_fold_split, stratified_fold_split
# Module-level side effect: registers tqdm's ``progress_apply`` on pandas
# objects; ``map_dataframe`` relies on it when called with ``verbose=True``.
tqdm.pandas()
def dataframe_to_list(dataframe: pd.DataFrame) -> List[dict]:
    """
    Converts dataframe to a list of rows (without indexes)

    Rows are emitted in dataframe order, one dict per row, keyed by
    column name. The index is ignored, so repeated index labels are fine.

    Args:
        dataframe (DataFrame): input dataframe

    Returns:
        (List[dict]): list of rows
    """
    # ``orient="records"`` builds the rows positionally. ``orient="index"``
    # keys every row by its index label first, so rows sharing a label
    # cannot all be represented (recent pandas versions raise ValueError).
    return dataframe.to_dict(orient="records")
def folds_to_list(folds: Union[list, str, pd.Series]) -> List[int]:
    """
    This function formats string or either list of numbers
    into a sorted list of unique int

    Args:
        folds (Union[list, str, pd.Series]): Either list of numbers or
            one string with numbers separated by commas or
            pandas series. Blank tokens in a string (empty string,
            trailing or doubled commas) are ignored.

    Returns:
        List[int]: sorted list of unique ints

    Examples:
        >>> folds_to_list("1,2,1,3,4,2,4,6")
        [1, 2, 3, 4, 6]
        >>> folds_to_list([1, 2, 3.0, 5])
        [1, 2, 3, 5]
        >>> folds_to_list("")
        []

    Raises:
        ValueError: if a non-blank value in string or array
            cannot be casted to int
    """
    if isinstance(folds, str):
        # A blank token carries no fold id; skipping it lets "" mean
        # "no folds" and tolerates a trailing comma.
        folds = [token for token in folds.split(",") if token.strip()]
    elif isinstance(folds, pd.Series):
        # Deduplicate in pandas before the per-item ``int`` conversion.
        # No sorting here: the values may not be mutually comparable yet.
        folds = folds.unique()
    return sorted({int(x) for x in folds})
def map_dataframe(
    dataframe: pd.DataFrame,
    tag_column: str,
    class_column: str,
    tag2class: Dict[str, int],
    verbose: bool = False
) -> pd.DataFrame:
    """
    This function maps tags from ``tag_column`` to ints into ``class_column``
    Using ``tag2class`` dictionary

    The input dataframe is not modified; a copy with the extra column
    is returned.

    Args:
        dataframe (pd.DataFrame): input dataframe
        tag_column (str): column with tags
        class_column (str): output column with classes
        tag2class (Dict[str, int]): mapping from tags to class labels
        verbose: flag if true, uses tqdm

    Returns:
        pd.DataFrame: updated dataframe with ``class_column``

    Raises:
        KeyError: if a tag (converted to ``str``) is missing
            from ``tag2class``
    """
    dataframe = dataframe.copy()

    def map_label(x):
        # Tags are looked up by their string form, so non-string tags
        # (e.g. ints read from csv) still match string keys.
        return tag2class[str(x)]

    tags = dataframe[tag_column]
    # ``progress_apply`` exists only after ``tqdm.pandas()`` has been called,
    # so the attribute is touched only when a progress bar is requested.
    apply_fn = tags.progress_apply if verbose else tags.apply

    # The mapped series shares the dataframe's index, so plain column
    # assignment is enough; a label-based ``.loc`` assignment re-aligns by
    # label and breaks when index labels repeat.
    dataframe[class_column] = apply_fn(map_label)
    return dataframe
def split_dataframe(
    dataframe: pd.DataFrame,
    train_folds: List[int],
    valid_folds: Optional[List[int]] = None,
    infer_folds: Optional[List[int]] = None,
    tag2class: Optional[Dict[str, int]] = None,
    tag_column: str = None,
    class_column: str = None,
    seed: int = 42,
    n_folds: int = 5
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Assign every row to a fold and cut the dataframe into
    train/valid/infer parts by fold number.

    Args:
        dataframe (pd.DataFrame): input dataframe
        train_folds (List[int]): folds that form the train part
        valid_folds (List[int], optional): folds that form the valid part.
            If none, every fold not listed in ``train_folds`` is used
        infer_folds (List[int], optional): folds that form the infer part.
            If none (or empty), the infer part is empty
        tag2class (Dict[str, int], optional): mapping from label names into
            int; applied only when ``tag_column`` and ``class_column``
            are given as well
        tag_column (str, optional): column with label names
        class_column (str, optional): column to use for split; when given,
            ``stratified_fold_split`` is used instead of
            ``default_fold_split``
        seed (int): seed for split
        n_folds (int): number of folds

    Returns:
        (tuple): tuple with 4 dataframes
            whole dataframe (with ``fold`` column), train part,
            valid part and infer part
    """
    if args_are_not_none(tag2class, tag_column, class_column):
        dataframe = map_dataframe(
            dataframe, tag_column, class_column, tag2class
        )

    if class_column is None:
        result_dataframe = default_fold_split(
            dataframe, random_state=seed, n_folds=n_folds
        )
    else:
        result_dataframe = stratified_fold_split(
            dataframe,
            class_column=class_column,
            random_state=seed,
            n_folds=n_folds
        )

    fold_of_row = result_dataframe["fold"]

    train_folds = folds_to_list(train_folds)
    in_train = fold_of_row.isin(train_folds)
    df_train = result_dataframe[in_train]

    if valid_folds is None:
        # Default validation set: every fold that is not used for training.
        valid_folds = fold_of_row[~in_train]
    valid_folds = folds_to_list(valid_folds)
    df_valid = result_dataframe[fold_of_row.isin(valid_folds)]

    # ``None`` collapses to an empty fold list, i.e. an empty infer part.
    infer_folds = folds_to_list(infer_folds or [])
    df_infer = result_dataframe[fold_of_row.isin(infer_folds)]

    return result_dataframe, df_train, df_valid, df_infer
def merge_multiple_fold_csv(
    fold_name: str, paths: Optional[str]
) -> pd.DataFrame:
    """
    Reads csv into one DataFrame with column ``fold``

    Args:
        fold_name (str): current fold name
        paths (str): paths to csv separated by commas;
            if ``None`` an empty dataframe is returned

    Returns:
        pd.DataFrame: merged dataframes with column ``fold`` == ``fold_name``
            and a fresh 0..n-1 index
    """
    if paths is None:
        return pd.DataFrame()

    frames = []
    for csv_path in paths.split(","):
        frame = pd.read_csv(csv_path)
        frame["fold"] = fold_name
        frames.append(frame)

    # One concat at the end instead of ``DataFrame.append`` per file:
    # ``append`` was removed in pandas 2.0 and re-copied the accumulated
    # frame on every iteration. ``split`` always yields at least one item,
    # so ``frames`` is never empty here.
    return pd.concat(frames, ignore_index=True)
def read_multiple_dataframes(
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """This function reads train/valid/infer dataframes from giving paths

    Args:
        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas
        tag2class (Dict[str, int], optional): mapping from label names into
            int; applied only when ``tag_column`` and ``class_column``
            are given as well
        tag_column (str, optional): column with label names
        class_column (str, optional): output column for the mapped classes

    Returns:
        (tuple): tuple with 4 dataframes
            whole dataframe, train part, valid part and infer part;
            a part whose paths were not given is ``None``

    Raises:
        AssertionError: if none of the three path arguments is given
    """
    sources = (
        ("train", in_csv_train),
        ("valid", in_csv_valid),
        ("infer", in_csv_infer),
    )
    assert any(paths is not None for _, paths in sources), \
        "at least one of in_csv_train, in_csv_valid, in_csv_infer is required"

    fold_dfs = {}
    for fold_name, paths in sources:
        if paths is None:
            continue
        fold_df = merge_multiple_fold_csv(fold_name=fold_name, paths=paths)
        if args_are_not_none(tag2class, tag_column, class_column):
            fold_df = map_dataframe(
                fold_df, tag_column, class_column, tag2class
            )
        fold_dfs[fold_name] = fold_df

    # Build the whole frame with a single concat (train, valid, infer order):
    # ``DataFrame.append`` was removed in pandas 2.0. Concatenating even a
    # single fold yields a new frame, so later in-place edits of a fold
    # frame cannot leak into the whole frame.
    result_df = None
    if fold_dfs:
        result_df = pd.concat(list(fold_dfs.values()), ignore_index=True)

    return (
        result_df,
        fold_dfs.get("train", None),
        fold_dfs.get("valid", None),
        fold_dfs.get("infer", None),
    )
def read_csv_data(
    in_csv: str = None,
    train_folds: Optional[List[int]] = None,
    valid_folds: Optional[List[int]] = None,
    infer_folds: Optional[List[int]] = None,
    seed: int = 42,
    n_folds: int = 5,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None,
) -> Tuple[pd.DataFrame, List[dict], List[dict], List[dict]]:
    """
    From giving path ``in_csv`` reads a dataframe
    and split it to train/valid/infer folds
    or from several paths ``in_csv_train``, ``in_csv_valid``, ``in_csv_infer``
    reads independent folds.

    Note:
        This function can be used with different combinations of params.
        First block is used to get dataset from one `csv`:
            in_csv, train_folds, valid_folds, infer_folds, seed, n_folds
        Second includes paths to different csv for train/valid and infer parts:
            in_csv_train, in_csv_valid, in_csv_infer
        The other params (tag2class, tag_column, class_column) are optional
            for any previous block

    Args:
        in_csv (str): paths to whole dataset
        train_folds (List[int]): train folds
        valid_folds (List[int], optional): valid folds.
            If none takes all folds not included in ``train_folds``
        infer_folds (List[int], optional): infer folds.
            If none, the infer part is empty
        seed (int): seed for split
        n_folds (int): number of folds
        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas
        tag2class (Dict[str, int]): mapping from label names into ints
        tag_column (str): column with label names
        class_column (str): column to use for split

    Returns:
        (Tuple[pd.DataFrame, List[dict], List[dict], List[dict]]):
            tuple with 4 elements
            (whole dataframe,
            list with train data,
            list with valid data
            and list with infer data);
            a list is ``None`` when the corresponding part was not read.
            The ``fold`` column is removed from the lists only.

    Raises:
        ValueError: if both ``in_csv`` and any of ``in_csv_*`` are given,
            or if none of them is given
    """
    from_one_df: bool = in_csv is not None
    from_multiple_df: bool = any(
        path is not None
        for path in (in_csv_train, in_csv_valid, in_csv_infer)
    )

    # Exactly one of the two input modes must be selected.
    if from_one_df == from_multiple_df:
        raise ValueError(
            "You should pass `in_csv` "
            "or `in_csv_train` with `in_csv_valid` but not both!"
        )

    if from_one_df:
        dataframe: pd.DataFrame = pd.read_csv(in_csv)
        dataframe, df_train, df_valid, df_infer = split_dataframe(
            dataframe,
            train_folds=train_folds,
            valid_folds=valid_folds,
            infer_folds=infer_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=seed,
            n_folds=n_folds
        )
    else:
        dataframe, df_train, df_valid, df_infer = read_multiple_dataframes(
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column
        )

    def _to_rows(part):
        """Convert one part to a list of row dicts without ``fold``."""
        if part is None:
            return None
        if "fold" in part.columns:
            # Drop on a copy rather than ``del`` in place: a part may share
            # its data with the whole dataframe returned alongside it, and
            # that one must keep its ``fold`` column.
            part = part.drop(columns="fold")
        return dataframe_to_list(part)

    return (
        dataframe,
        _to_rows(df_train),
        _to_rows(df_valid),
        _to_rows(df_infer),
    )