Skip to content

Commit a10aa45

Browse files
agunapal and mreso authored
Make a copy of the torchtext utils to remove dependency (#3076)
* Make a copy of the torchtext utils to remove dependency * Make a copy of the torchtext utils to remove dependency * missing import * Updated scriptable tokenizer example * Updated scriptable tokenizer example * added missing function --------- Co-authored-by: Matthias Reso <[email protected]>
1 parent dd1f8dc commit a10aa45

File tree

6 files changed

+138
-2
lines changed

6 files changed

+138
-2
lines changed

examples/text_classification_with_scriptable_tokenizer/README.md

+3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Text Classification using a Scriptable Tokenizer
22

3+
## Deprecation Warning!
4+
This example requires TorchText which is deprecated. Please use version <= 0.11.0 of TorchServe for this example
5+
36
TorchScript is a way to serialize and optimize your PyTorch models.
47
A scriptable tokenizer is a special tokenizer which is compatible with [TorchScript's compiler](https://pytorch.org/docs/stable/jit.html) so that it can be jointly serialized with a PyTorch model.
58
When deploying an NLP model it is important to use the same tokenizer during training and inference to achieve the same model accuracy in both phases of the model live cycle.

test/pytest/conftest.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
collect_ignore.append("test_example_torchrec_dlrm.py")
1515
collect_ignore.append("test_example_near_real_time_video.py")
1616
collect_ignore.append("test_dali_preprocess.py")
17+
collect_ignore.append("test_example_scriptable_tokenzier.py")
1718

1819

1920
@pytest.fixture(scope="module")

ts/handler_utils/text_utils.py

+129
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""
2+
Functions which have been copied from TorchText to remove TorchServe's
3+
dependency on TorchText
4+
5+
from torchtext.data.utils import ngrams_iterator, get_tokenizer
6+
7+
"""
8+
9+
import re
10+
11+
12+
def ngrams_iterator(token_list, ngrams):
13+
"""Return an iterator that yields the given tokens and their ngrams.
14+
15+
Args:
16+
token_list: A list of tokens
17+
ngrams: the number of ngrams.
18+
19+
Examples:
20+
>>> token_list = ['here', 'we', 'are']
21+
>>> list(ngrams_iterator(token_list, 2))
22+
>>> ['here', 'here we', 'we', 'we are', 'are']
23+
"""
24+
25+
def _get_ngrams(n):
26+
return zip(*[token_list[i:] for i in range(n)])
27+
28+
for x in token_list:
29+
yield x
30+
for n in range(2, ngrams + 1):
31+
for x in _get_ngrams(n):
32+
yield " ".join(x)
33+
34+
35+
_patterns = [
36+
r"\'",
37+
r"\"",
38+
r"\.",
39+
r"<br \/>",
40+
r",",
41+
r"\(",
42+
r"\)",
43+
r"\!",
44+
r"\?",
45+
r"\;",
46+
r"\:",
47+
r"\s+",
48+
]
49+
50+
_replacements = [
51+
" ' ",
52+
"",
53+
" . ",
54+
" ",
55+
" , ",
56+
" ( ",
57+
" ) ",
58+
" ! ",
59+
" ? ",
60+
" ",
61+
" ",
62+
" ",
63+
]
64+
65+
_patterns_dict = list((re.compile(p), r) for p, r in zip(_patterns, _replacements))
66+
67+
68+
def _basic_english_normalize(line):
    r"""
    Basic normalization for a line of text.
    Normalization includes
    - lowercasing
    - complete some basic text normalization for English words as follows:
        add spaces before and after '\''
        remove '\"',
        add spaces before and after '.'
        replace '<br \/>' with single space
        add spaces before and after ','
        add spaces before and after '('
        add spaces before and after ')'
        add spaces before and after '!'
        add spaces before and after '?'
        replace ';' with single space
        replace ':' with single space
        replace multiple spaces with single space

    Returns a list of tokens after splitting on whitespace.
    """
    normalized = line.lower()
    # Apply the module-level substitutions in declaration order; order
    # matters because the final r"\s+" pass collapses spaces introduced
    # by the earlier rules.
    for compiled_pattern, replacement in _patterns_dict:
        normalized = compiled_pattern.sub(replacement, normalized)
    return normalized.split()
94+
95+
96+
def _split_tokenizer(x): # noqa: F821
97+
return x.split()
98+
99+
100+
def get_tokenizer(tokenizer, language="en"):
    r"""
    Generate tokenizer function for a string sentence.

    This is a trimmed copy of ``torchtext.data.utils.get_tokenizer``.
    Unlike the torchtext original, tokenizer *libraries* (spacy, moses,
    toktok, revtok, subword) are NOT supported here.

    Args:
        tokenizer: the name of tokenizer function. If None, it returns
            split() function, which splits the string sentence by space.
            If basic_english, it returns _basic_english_normalize()
            function, which normalizes the string first and splits by
            space. If a callable function, it will return the function.
        language: Default en. Only "en" is supported.

    Raises:
        ValueError: if ``tokenizer`` is "basic_english" and ``language``
            is not "en", or if ``tokenizer`` is an unsupported name.

    Examples:
        >>> tokenizer = get_tokenizer("basic_english")
        >>> tokens = tokenizer("You can now install TorchText using pip!")
        >>> tokens
        >>> ['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']

    """

    # default tokenizer is string.split(), added as a module function for
    # serialization
    if tokenizer is None:
        return _split_tokenizer

    if tokenizer == "basic_english":
        if language != "en":
            # Typo fixed from the torchtext original ("Enlish").
            raise ValueError("Basic normalization is only available for English(en)")
        return _basic_english_normalize

    # The torchtext original would look up third-party tokenizer libraries
    # here; this copy deliberately drops that dependency. Honor the
    # documented callable case, and fail loudly instead of silently
    # returning None for anything else.
    if callable(tokenizer):
        return tokenizer
    raise ValueError(
        f"Unsupported tokenizer {tokenizer!r}: this copy of the torchtext "
        "utils only supports None, 'basic_english', or a callable"
    )

ts/torch_handler/text_classifier.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
import torch
1010
import torch.nn.functional as F
1111
from captum.attr import TokenReferenceBase
12-
from torchtext.data.utils import ngrams_iterator
12+
13+
from ts.handler_utils.text_utils import ngrams_iterator
1314

1415
from ..utils.util import map_class_to_label
1516
from .text_handler import TextHandler

ts/torch_handler/text_handler.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
import torch
1313
import torch.nn.functional as F
1414
from captum.attr import LayerIntegratedGradients
15-
from torchtext.data.utils import get_tokenizer
15+
16+
from ts.handler_utils.text_utils import get_tokenizer
1617

1718
from ..utils.util import CLEANUP_REGEX
1819
from .base_handler import BaseHandler

ts_scripts/spellcheck_conf/wordlist.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1247,3 +1247,4 @@ quant
12471247
quantizing
12481248
smoothquant
12491249
woq
1250+
TorchText

0 commit comments

Comments
 (0)