updated model-config

lxning · lxning · commit 297c6769aab7 · 2024-03-21T15:34:36.000-07:00
diff --git a/examples/large_models/inferentia2/llama2/continuous_batching/inf2-llama-2-continuous-batching.ipynb b/examples/large_models/inferentia2/llama2/continuous_batching/inf2-llama-2-continuous-batching.ipynb
@@ -69,10 +69,10 @@
    "source": [
     "# login in Hugginface hub\n",
     "!huggingface-cli login --token $HUGGINGFACE_TOKEN\n",
-    "!python examples/large_models/utils/Download_model.py --model_path model --model_name meta-llama/Llama-2-13b-hf --use_auth_token True\n",
+    "!python examples/large_models/utils/Download_model.py --model_path model --model_name meta-llama/Llama-2-70b-hf --use_auth_token True\n",
     "\n",
     "# Create TorchServe model artifacts\n",
-    "!torch-model-archiver --model-name llama-2-70b --version 1.0 --handler ts/torch_handler/distributed/base_neuronx_continuous_batching_handler.py -r examples/large_models/inferentia2/llama2/requirements.txt --config-file examples/large_models/inferentia2/llama2/continuous_batching/model-config.yaml --archive-format no-archive\n",
+    "!torch-model-archiver --model-name llama-2-70b --version 1.0 --handler ts/torch_handler/distributed/base_neuronx_continuous_batching_handler.py -r examples/large_models/inferentia2/llama2/continuous_batching/requirements.txt --config-file examples/large_models/inferentia2/llama2/continuous_batching/model-config.yaml --archive-format no-archive\n",
     "\n",
     "!mkdir -p model_store\n",
     "!mv llama-2-70b model_store\n",
diff --git a/examples/large_models/inferentia2/llama2/streamer/model-config.yaml b/examples/large_models/inferentia2/llama2/streamer/model-config.yaml
@@ -5,10 +5,15 @@ responseTimeout: 10800
 batchSize: 16
 
 handler:
-    model_checkpoint_dir: "llama-2-13b-split"
+    model_path: "model/models--meta-llama--Llama-2-70b-hf/snapshots/90052941a64de02075ca800b09fcea1bdaacb939"
+    model_checkpoint_dir: "llama-2-70b-split"
+    model_module_prefix: "transformers_neuronx"
+    model_class_name: "llama.model.LlamaForSampling"
+    tokenizer_class_name: "transformers.LlamaTokenizer"
     amp: "bf16"
-    tp_degree: 6
-    max_length: 100
+    tp_degree: 24
+    max_length: 256
+    max_new_tokens: 50
 
 micro_batching:
     micro_batch_size: 4
diff --git a/examples/large_models/inferentia2/llama2/streamer/requirements.txt b/examples/large_models/inferentia2/llama2/streamer/requirements.txt
@@ -0,0 +1,2 @@
+transformers==4.36.2
+sentencepiece==0.1.99

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+transformers==4.36.2`
	`2`	`+sentencepiece==0.1.99`