Source code for catalyst.contrib.utils.nlp.text
from typing import Dict, List, Union
import re
import string

import numpy as np
import torch
from catalyst.contrib.nn.modules import LamaPooling


def tokenize_text(
    text: str,
    tokenizer,  # HuggingFace tokenizer, e.g. BertTokenizer
    max_length: int,
    strip: bool = True,
    lowercase: bool = True,
    remove_punctuation: bool = True,
) -> Dict[str, np.ndarray]:
"""Tokenizes givin text.
Args:
text: text to tokenize
tokenizer: Tokenizer instance from HuggingFace
max_length: maximum length of tokens
strip: if true strips text before tokenizing
lowercase: if true makes text lowercase before tokenizing
remove_punctuation: if true
removes ``string.punctuation`` from text before tokenizing
Returns:
batch with tokenized text
"""
    if strip:
        text = text.strip()
    if lowercase:
        text = text.lower()
    if remove_punctuation:
        text = text.translate(str.maketrans("", "", string.punctuation))
    # collapse any run of whitespace characters into a single space
    text = re.sub(r"\s+", " ", text).strip()
    inputs = tokenizer.encode_plus(
        text, "", add_special_tokens=True, max_length=max_length
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1] * len(input_ids)

    # zero-pad input ids, attention mask and token type ids to ``max_length``
    padding_length = max_length - len(input_ids)
    input_ids = input_ids + ([0] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)
    return {
        "input_ids": np.array(input_ids, dtype=np.int64),
        "token_type_ids": np.array(token_type_ids, dtype=np.int64),
        "attention_mask": np.array(attention_mask, dtype=np.int64),
    }
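
# A minimal usage sketch (an illustration, not part of the original module;
# it assumes the ``transformers`` package and the "bert-base-uncased"
# vocabulary are available):
#
#     from transformers import BertTokenizer
#
#     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#     batch = tokenize_text("Hello, Catalyst!", tokenizer, max_length=32)
#     assert batch["input_ids"].shape == (32,)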


def process_bert_output(
    bert_output,
    hidden_size: int,
    output_hidden_states: bool = False,
    pooling_groups: List[str] = None,
    mask: torch.Tensor = None,
    level: Union[int, str] = None,
):
"""Processed BERT output.
Args:
bert_output: BERT output
hidden_size: hidden size of BERT layers
output_hidden_states: boolean flag if we need BERT hidden states
pooling_groups: list with pooling to use for sequence embedding
mask: boolean flag if we need mask ``[PAD]`` tokens
level: integer with specified level to use
Returns:
processed output
"""
    pooling = (
        LamaPooling(groups=pooling_groups, in_features=hidden_size)
        if pooling_groups is not None
        else None
    )

    def _process_features(features):
        if pooling is not None:
            features = pooling(features, mask=mask)
        return features
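    # ``bert_output`` is indexed tuple-style: [0] is the last hidden state,
    # [1] the pooled [CLS] representation and, when the model is configured
    # with ``output_hidden_states=True``, [2] holds all hidden states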
    if isinstance(level, str):
        assert level in ("pooling", "class")
        if level == "pooling":
            return _process_features(bert_output[0])
        else:
            return bert_output[1]
    elif isinstance(level, int):
        return _process_features(bert_output[2][level])
    output = {
        "pooling": _process_features(bert_output[0]),
        "class": bert_output[1],
    }
    if output_hidden_states:
        for i, feature in enumerate(bert_output[2]):
            output[i] = _process_features(feature)
    return output
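
# A hypothetical end-to-end sketch combining both helpers (assumes the
# ``transformers`` package; the model must be created with
# ``output_hidden_states=True`` for the hidden-state levels to be present):
#
#     from transformers import BertModel, BertTokenizer
#
#     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#     model = BertModel.from_pretrained(
#         "bert-base-uncased", output_hidden_states=True
#     )
#     batch = tokenize_text("Hello, Catalyst!", tokenizer, max_length=32)
#     features = {
#         key: torch.from_numpy(value).unsqueeze(0)
#         for key, value in batch.items()
#     }
#     bert_output = model(**features)
#     embeddings = process_bert_output(
#         bert_output,
#         hidden_size=model.config.hidden_size,
#         output_hidden_states=True,
#     )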


__all__ = ["tokenize_text", "process_bert_output"]