Source code for catalyst.data.__main__
# -*- coding: utf-8 -*-
r"""Catalyst-data scripts.
Examples:
1. **process-images** reads raw data and outputs
preprocessed resized images
.. code:: bash
$ catalyst-data process-images \\
--in-dir /path/to/raw/data/ \\
--out-dir=./data/dataset \\
--num-workers=6 \\
--max-size=224 \\
--extension=png \\
--clear-exif \\
--grayscale \\
--expand-dims
2. **tag2label** prepares a dataset to json like
`{"class_id": class_column_from_dataset}`
.. code:: bash
$ catalyst-data tag2label \\
--in-dir=./data/dataset \\
--out-dataset=./data/dataset_raw.csv \\
--out-labeling=./data/tag2cls.json
3. **check-images** checks images in your data
to be non-broken and writes a flag:
`true` if image opened without an error and `false` otherwise
.. code:: bash
$ catalyst-data check-images \\
--in-csv=./data/dataset_raw.csv \\
--img-rootpath=./data/dataset \\
--img-col="tag" \\
--out-csv=./data/dataset_checked.csv \\
--n-cpu=4
4. **split-dataframe** split your dataset into train/valid folds
.. code:: bash
$ catalyst-data split-dataframe \\
--in-csv=./data/dataset_raw.csv \\
--tag2class=./data/tag2cls.json \\
--tag-column=tag \\
--class-column=class \\
--n-folds=5 \\
--train-folds=0,1,2,3 \\
--out-csv=./data/dataset.csv
5. **image2embedding** embeds images from your csv
or image directory with specified neural net architecture
.. code:: bash
$ catalyst-data image2embedding \\
--in-csv=./data/input.csv \\
--img-col="filename" \\
--img-size=64 \\
--out-npy=./embeddings.npy \\
--arch=resnet34 \\
--pooling=GlobalMaxPool2d \\
--batch-size=8 \\
--num-workers=16 \\
--verbose
"""
from argparse import ArgumentParser, RawTextHelpFormatter
from collections import OrderedDict
import logging
import os
from catalyst.__version__ import __version__
from catalyst.data.scripts import (
image2embedding,
process_images,
project_embeddings,
split_dataframe,
tag2label,
)
logger = logging.getLogger(__name__)
COMMANDS = OrderedDict(
[
("tag2label", tag2label),
("process-images", process_images),
("split-dataframe", split_dataframe),
("image2embedding", image2embedding),
("project-embeddings", project_embeddings),
]
)
try:
import transformers # noqa: F401
from catalyst.data.scripts import text2embedding
COMMANDS["text2embedding"] = text2embedding
except ImportError as ex:
if os.environ.get("USE_TRANSFORMERS", "0") == "1":
logger.warning(
"transformers not available, to install transformers,"
" run `pip install transformers`."
)
raise ex
[docs]def build_parser() -> ArgumentParser:
"""
@TODO: Docs. Contribution is welcome
"""
parser = ArgumentParser(
"catalyst-data", formatter_class=RawTextHelpFormatter
)
parser.add_argument(
"-v", "--version", action="version", version=f"%(prog)s {__version__}"
)
all_commands = ", \n".join(map(lambda x: f" {x}", COMMANDS.keys()))
subparsers = parser.add_subparsers(
metavar="{command}",
dest="command",
help=f"available commands: \n{all_commands}",
)
subparsers.required = True
for key, value in COMMANDS.items():
value.build_args(subparsers.add_parser(key))
return parser
[docs]def main():
"""
@TODO: Docs. Contribution is welcome
"""
parser = build_parser()
args, uargs = parser.parse_known_args()
COMMANDS[args.command].main(args, uargs)
if __name__ == "__main__":
main()