# Source code for catalyst.contrib.utils.nlp.text

from typing import Dict, List, Union
import string

import numpy as np

import torch

from catalyst.contrib.nn.modules import LamaPooling

def tokenize_text(
    text: str,
    tokenizer,  # HuggingFace tokenizer, ex: BertTokenizer
    max_length: int,
    strip: bool = True,
    lowercase: bool = True,
    remove_punctuation: bool = True,
) -> Dict[str, np.ndarray]:
    """Tokenizes given text.

    The text is optionally stripped, lowercased and cleaned of punctuation,
    its whitespace is normalized, and the result is encoded with
    ``tokenizer.encode_plus``. The encoded sequences are then right-padded
    with zeros up to ``max_length``.

    Args:
        text: text to tokenize
        tokenizer: Tokenizer instance from HuggingFace; must provide
            ``encode_plus`` returning ``"input_ids"`` and
            ``"token_type_ids"``
        max_length: maximum length of tokens, forwarded to the tokenizer
            and used as the padding target
        strip: if true strips text before tokenizing
        lowercase: if true makes text lowercase before tokenizing
        remove_punctuation: if true removes ``string.punctuation``
            from text before tokenizing

    Returns:
        batch with tokenized text: a dict with ``"input_ids"``,
        ``"token_type_ids"`` and ``"attention_mask"`` keys, each holding
        a 1-D ``np.int64`` array
    """
    if strip:
        text = text.strip()
    if lowercase:
        text = text.lower()
    if remove_punctuation:
        text = text.translate(str.maketrans("", "", string.punctuation))

    # Collapse every run of whitespace (spaces, tabs, newlines) into a single
    # space and trim both ends. ``str.replace`` matches literally, so regex
    # patterns such as r"\s" must not be passed to it.
    text = " ".join(text.split())

    inputs = tokenizer.encode_plus(
        text, "", add_special_tokens=True, max_length=max_length
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

    # Every token produced by the tokenizer is attended to; padding is not.
    attention_mask = [1] * len(input_ids)

    # If the tokenizer returned more than ``max_length`` tokens this is
    # negative and ``[0] * padding_length`` is empty, i.e. nothing is padded.
    padding_length = max_length - len(input_ids)
    padding = [0] * padding_length

    input_ids = input_ids + padding
    attention_mask = attention_mask + padding
    token_type_ids = token_type_ids + padding

    return {
        "input_ids": np.array(input_ids, dtype=np.int64),
        "token_type_ids": np.array(token_type_ids, dtype=np.int64),
        "attention_mask": np.array(attention_mask, dtype=np.int64),
    }


def process_bert_output(
    bert_output,
    hidden_size: int,
    output_hidden_states: bool = False,
    pooling_groups: Union[List[str], None] = None,
    mask: Union[torch.Tensor, None] = None,
    level: Union[int, str, None] = None,
):
    """Processes BERT output.

    ``bert_output`` is indexed positionally: ``bert_output[0]`` is used as
    the sequence features (optionally pooled), ``bert_output[1]`` is returned
    as the ``"class"`` output, and ``bert_output[2]`` is iterated/indexed as
    the per-layer hidden states.

    Args:
        bert_output: BERT output
        hidden_size: hidden size of BERT layers, passed to the pooling
            layer as ``in_features``
        output_hidden_states: boolean flag if we need BERT hidden states
            in the returned dict (only used when ``level`` selects nothing)
        pooling_groups: list with pooling to use for sequence embedding;
            if ``None`` features are returned without pooling
        mask: tensor forwarded to the pooling layer as ``mask``
            (presumably marks which tokens are not ``[PAD]`` - confirm
            against ``LamaPooling``)
        level: which single output to return: ``"pooling"``, ``"class"``
            or an integer index into the hidden states; if ``None`` a dict
            with all outputs is returned

    Returns:
        processed output: a single feature object when ``level`` is a
        string or an integer, otherwise a dict with ``"pooling"`` and
        ``"class"`` keys plus one integer key per hidden state when
        ``output_hidden_states`` is true

    Raises:
        ValueError: if ``level`` is a string other than ``"pooling"``
            or ``"class"``
    """
    # NOTE(review): a new pooling module is built on every call; if any
    # pooling group has learnable parameters they are not shared between
    # calls - confirm this is intended.
    pooling = (
        LamaPooling(groups=pooling_groups, in_features=hidden_size)
        if pooling_groups is not None
        else None
    )

    def _process_features(features):
        """Apply the pooling layer to ``features`` when one is configured."""
        if pooling is not None:
            features = pooling(features, mask=mask)
        return features

    if isinstance(level, str):
        # Explicit raise instead of ``assert``: asserts are stripped under
        # ``python -O`` and an unknown level would silently yield "class".
        if level not in ("pooling", "class"):
            raise ValueError(
                f"level must be 'pooling', 'class' or an integer, got {level!r}"
            )
        if level == "pooling":
            return _process_features(bert_output[0])
        return bert_output[1]

    if isinstance(level, int):
        return _process_features(bert_output[2][level])

    output = {
        "pooling": _process_features(bert_output[0]),
        "class": bert_output[1],
    }
    if output_hidden_states:
        for i, feature in enumerate(bert_output[2]):
            output[i] = _process_features(feature)

    return output


# Names exported by ``from <this module> import *``; the module's public API.
__all__ = ["tokenize_text", "process_bert_output"]