
Commit e6654ec

mreso and msaroufim authored
Enable fx_graph_cache in gpt-fast example (#2935)
* Enable fx_graph_cache in gpt-fast example
* mention fx_graph_cache in readme
* Fix spellcheck
* Update README.md

Co-authored-by: Mark Saroufim <[email protected]>
1 parent e3f8703 commit e6654ec

File tree: 7 files changed, +18 −5 lines changed


examples/large_models/gpt_fast/README.md (+3 −1)

@@ -9,7 +9,7 @@ It features:
 * No dependencies other than PyTorch and sentencepiece
 * int8/int4 quantization
 * Speculative decoding
-* Tensor parallelism
+* Supports multi-GPU inference through Tensor parallelism
 * Supports Nvidia and AMD GPUs
 
 More details about gpt-fast can be found in this [blog](https://pytorch.org/blog/accelerating-generative-ai-2/).
@@ -81,6 +81,8 @@ cd ..
 At this stage we're creating the model archive which includes the configuration of our model in [model_config.yaml](./model_config.yaml).
 It's also the point where we need to decide if we want to deploy our model on a single or multiple GPUs.
 For the single GPU case we can use the default configuration that can be found in [model_config.yaml](./model_config.yaml).
+All configs enable the current prototyping feature FxGraphCache by setting fx_graph_cache to *true*.
+This feature stores the TorchInductor output in a cache to speed up torch.compile times when rerunning the handler.
 
 ```
 torch-model-archiver --model-name gpt_fast --version 1.0 --handler handler.py --config-file model_config.yaml --extra-files "gpt-fast/generate.py,gpt-fast/model.py,gpt-fast/quantize.py,gpt-fast/tp.py" --archive-format no-archive
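The two README lines added here describe a cache for torch.compile artifacts. As a minimal standalone sketch (outside TorchServe, assuming a PyTorch build where the FX graph cache prototype is available), the same environment variable the handler sets can be exported before compiling; the toy model and timing below are illustrative only:

```python
import os
import time

# Assumption: this is the variable TorchInductor checks for its FX graph cache,
# matching what the gpt-fast handler sets; it must be set before compiling.
os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"

import torch

# Toy model standing in for gpt-fast, purely for illustration.
model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())
compiled = torch.compile(model)

start = time.time()
compiled(torch.randn(4, 16))  # first call triggers compilation
print(f"first call took {time.time() - start:.2f}s")
# Rerunning this script lets TorchInductor reuse the cached compile output
# instead of rebuilding it, which is the speed-up the README refers to.
```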

examples/large_models/gpt_fast/handler.py (+4)

@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 import time
 from pathlib import Path
 
@@ -84,6 +85,9 @@ def initialize(self, ctx):
         self.tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
 
         if ctx.model_yaml_config["handler"]["compile"]:
+            if ctx.model_yaml_config["handler"].get("fx_graph_cache", False):
+                os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
+
             if self.is_speculative and use_tp:
                 torch._inductor.config.triton.cudagraph_trees = (
                     False  # Bug with cudagraph trees in this case
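The handler change above translates an optional YAML flag into the environment variable TorchInductor reads. Here is a stripped-down sketch of the same pattern with a plain dict standing in for ctx.model_yaml_config (the dict literal is illustrative, not the TorchServe API):

```python
import os

# Stand-in for the parsed model_config.yaml; illustrative values only.
model_yaml_config = {
    "handler": {
        "compile": True,
        "fx_graph_cache": True,
    }
}

handler_cfg = model_yaml_config["handler"]
if handler_cfg["compile"]:
    # .get() with a False default keeps caching off for configs
    # that never define the key, e.g. older model_config.yaml files.
    if handler_cfg.get("fx_graph_cache", False):
        os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
```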

examples/large_models/gpt_fast/model_config.yaml (+1)

@@ -8,3 +8,4 @@ handler:
     converted_ckpt_dir: "checkpoints/meta-llama/Llama-2-7b-hf/model.pth"
     max_new_tokens: 50
     compile: true
+    fx_graph_cache: True

examples/large_models/gpt_fast/model_config_speculative.yaml (+1)

@@ -14,3 +14,4 @@ handler:
     max_new_tokens: 50
     compile: true
     stream: false
+    fx_graph_cache: True

examples/large_models/gpt_fast/model_config_tp.yaml (+1)

@@ -13,3 +13,4 @@ handler:
     max_new_tokens: 50
     compile: true
     stream: false
+    fx_graph_cache: True
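All three configs spell the new flag as a capitalized True while compile uses lowercase true; YAML parses both spellings as booleans, so the handler's truthiness check behaves identically. A quick sketch to confirm, assuming PyYAML is available (as used to parse these configs):

```python
import yaml  # assumption: PyYAML is available in the environment

snippet = """
handler:
  compile: true
  fx_graph_cache: True
"""

cfg = yaml.safe_load(snippet)["handler"]
# Both spellings load as Python booleans, so the handler's
# .get("fx_graph_cache", False) check sees a plain True either way.
assert cfg["compile"] is True and cfg["fx_graph_cache"] is True
print(cfg)  # {'compile': True, 'fx_graph_cache': True}
```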

test/pytest/test_example_gpt_fast.py (+5 −4)

@@ -50,7 +50,7 @@
     {
         "nproc": 1,
         "stream": "true",
-        "compile": "false",
+        "compile": "true",
     },
     {
         "nproc": 4,
@@ -74,6 +74,7 @@
 EXPECTED_RESULTS = [
     # ", Paris, is a city of romance, fashion, and art. The city is home to the Eiffel Tower, the Louvre, and the Arc de Triomphe. Paris is also known for its cafes, restaurants",
     " is Paris.\nThe capital of Germany is Berlin.\nThe capital of Italy is Rome.\nThe capital of Spain is Madrid.\nThe capital of the United Kingdom is London.\nThe capital of the European Union is Brussels.\n",
+    " is Paris.\n\nThe capital of Germany is Berlin.\n\nThe capital of Italy is Rome.\n\nThe capital of Spain is Madrid.\n\nThe capital of the United Kingdom is London.\n\nThe capital of the United States is",
 ]
 
 
@@ -116,7 +117,7 @@ def test_handler(tmp_path, add_paths, compile, mocker):
     ctx.model_yaml_config = config
     ctx.request_ids = {0: "0"}
 
-    torch.manual_seed(42 * 42)
+    torch.manual_seed(42)
     handler.initialize(ctx)
 
     assert ("cuda:0" if torch.cuda.is_available() else "cpu") == str(handler.device)
@@ -129,7 +130,7 @@ def test_handler(tmp_path, add_paths, compile, mocker):
 
         result = "".join(c[0][0][0] for c in send_mock.call_args_list)
 
-        assert result == EXPECTED_RESULTS[0]
+        assert result == EXPECTED_RESULTS[1 if compile == "true" else 0]
     finally:
         # free memory in case of failed test
         del handler.model
@@ -241,4 +242,4 @@ def test_gpt_fast_mar(model_name_and_stdout):
 
     assert len(prediction) > 1
 
-    assert "".join(prediction) == EXPECTED_RESULTS[0]
+    assert "".join(prediction) == EXPECTED_RESULTS[1]

ts_scripts/spellcheck_conf/wordlist.txt (+3)

@@ -1181,4 +1181,7 @@ Karpathy's
 Maher's
 warmup
 SOTA
+FxGraphCache
+TorchInductor
+fx
 locustapache
