Make a copy of the torchtext utils to remove dependency (#3076)

agunapal · mreso · web-flow · commit a10aa452ff2c · 2024-06-06T20:55:13.000Z
* Make a copy of the torchtext utils to remove dependency

* Make a copy of the torchtext utils to remove dependency

* missing import

* Updated scriptable tokenizer example

* Updated scriptable tokenizer example

* added missing function

---------

Co-authored-by: Matthias Reso <13337103+mreso@users.noreply.github.com>
diff --git a/examples/text_classification_with_scriptable_tokenizer/README.md b/examples/text_classification_with_scriptable_tokenizer/README.md
@@ -1,5 +1,8 @@
 # Text Classification using a Scriptable Tokenizer
 
+## Deprecation Warning!
+This example requires TorchText, which is deprecated. Please use version <= 0.11.0 of TorchServe for this example.
+
 TorchScript is a way to serialize and optimize your PyTorch models.
 A scriptable tokenizer is a special tokenizer which is compatible with [TorchScript's compiler](https://pytorch.org/docs/stable/jit.html) so that it can be jointly serialized with a PyTorch model.
 When deploying an NLP model it is important to use the same tokenizer during training and inference to achieve the same model accuracy in both phases of the model live cycle.
diff --git a/test/pytest/conftest.py b/test/pytest/conftest.py
@@ -14,6 +14,7 @@
 collect_ignore.append("test_example_torchrec_dlrm.py")
 collect_ignore.append("test_example_near_real_time_video.py")
 collect_ignore.append("test_dali_preprocess.py")
+collect_ignore.append("test_example_scriptable_tokenzier.py")
 
 
 @pytest.fixture(scope="module")
diff --git a/ts/handler_utils/text_utils.py b/ts/handler_utils/text_utils.py
@@ -0,0 +1,129 @@
+"""
+Functions which have been copied from TorchText to remove TorchServe's
+dependency on TorchText
+
+from torchtext.data.utils import ngrams_iterator, get_tokenizer
+
+"""
+
+import re
+
+
+def ngrams_iterator(token_list, ngrams):
+    """Return an iterator that yields the given tokens and their ngrams.
+
+    Args:
+        token_list: A list of tokens
+        ngrams: the maximum n-gram size; all n-grams of size 1 up to ngrams are yielded.
+
+    Examples:
+        >>> token_list = ['here', 'we', 'are']
+        >>> list(ngrams_iterator(token_list, 2))
+        ['here', 'we', 'are', 'here we', 'we are']
+    """
+
+    def _get_ngrams(n):
+        return zip(*[token_list[i:] for i in range(n)])
+
+    for x in token_list:
+        yield x
+    for n in range(2, ngrams + 1):
+        for x in _get_ngrams(n):
+            yield " ".join(x)
+
+
+_patterns = [
+    r"\'",
+    r"\"",
+    r"\.",
+    r"<br \/>",
+    r",",
+    r"\(",
+    r"\)",
+    r"\!",
+    r"\?",
+    r"\;",
+    r"\:",
+    r"\s+",
+]
+
+_replacements = [
+    " '  ",
+    "",
+    " . ",
+    " ",
+    " , ",
+    " ( ",
+    " ) ",
+    " ! ",
+    " ? ",
+    " ",
+    " ",
+    " ",
+]
+
+_patterns_dict = list((re.compile(p), r) for p, r in zip(_patterns, _replacements))
+
+
+def _basic_english_normalize(line):
+    r"""
+    Basic normalization for a line of text.
+    Normalization includes
+    - lowercasing
+    - complete some basic text normalization for English words as follows:
+        add spaces before and after '\''
+        remove '\"',
+        add spaces before and after '.'
+        replace '<br \/>' with single space
+        add spaces before and after ','
+        add spaces before and after '('
+        add spaces before and after ')'
+        add spaces before and after '!'
+        add spaces before and after '?'
+        replace ';' with single space
+        replace ':' with single space
+        replace multiple spaces with single space
+
+    Returns a list of tokens after splitting on whitespace.
+    """
+
+    line = line.lower()
+    for pattern_re, replaced_str in _patterns_dict:
+        line = pattern_re.sub(replaced_str, line)
+    return line.split()
+
+
+def _split_tokenizer(x):  # noqa: F821
+    return x.split()
+
+
+def get_tokenizer(tokenizer, language="en"):
+    r"""
+    Generate tokenizer function for a string sentence.
+
+    Args:
+        tokenizer: the name of tokenizer function. If None, it returns split()
+            function, which splits the string sentence by space.
+            If basic_english, it returns _basic_english_normalize() function,
+            which normalizes the string first and splits by space. Unlike the
+            TorchText original, this copy does not handle callables or
+            tokenizer libraries (e.g. spacy, moses, toktok, revtok, subword);
+            any other value falls through and returns None.
+        language: Default en
+
+    Examples:
+        >>> tokenizer = get_tokenizer("basic_english")
+        >>> tokens = tokenizer("You can now install TorchText using pip!")
+        >>> tokens
+        ['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']
+
+    """
+
+    # default tokenizer is string.split(), added as a module function for serialization
+    if tokenizer is None:
+        return _split_tokenizer
+
+    if tokenizer == "basic_english":
+        if language != "en":
+            raise ValueError("Basic normalization is only available for Enlish(en)")
+        return _basic_english_normalize
diff --git a/ts/torch_handler/text_classifier.py b/ts/torch_handler/text_classifier.py
@@ -9,7 +9,8 @@
 import torch
 import torch.nn.functional as F
 from captum.attr import TokenReferenceBase
-from torchtext.data.utils import ngrams_iterator
+
+from ts.handler_utils.text_utils import ngrams_iterator
 
 from ..utils.util import map_class_to_label
 from .text_handler import TextHandler
diff --git a/ts/torch_handler/text_handler.py b/ts/torch_handler/text_handler.py
@@ -12,7 +12,8 @@
 import torch
 import torch.nn.functional as F
 from captum.attr import LayerIntegratedGradients
-from torchtext.data.utils import get_tokenizer
+
+from ts.handler_utils.text_utils import get_tokenizer
 
 from ..utils.util import CLEANUP_REGEX
 from .base_handler import BaseHandler
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1247,3 +1247,4 @@ quant
 quantizing
 smoothquant
 woq
+TorchText