
Commit e212294
Use startup time in async worker thread instead of worker timeout (#3315)
* Use startup time in async worker thread instead of worker timeout
* Fix lint
* Update yaml files to use startupTimeout
* Update vllm/lora readme
1 parent c6dde82 · commit e212294

6 files changed (+21, −11)

examples/large_models/vllm/llama3/model-config.yaml (+1, −1)

```diff
@@ -2,7 +2,7 @@
 minWorkers: 1
 maxWorkers: 1
 maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
 deviceType: "gpu"
 asyncCommunication: true

```

examples/large_models/vllm/lora/Readme.md (+1, −1)

````diff
@@ -55,7 +55,7 @@ The vllm integration uses an OpenAI compatible interface which lets you perform

 Curl:
 ```bash
-curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1
+curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1/completions
 ```

 Python + Request:
````
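
For reference, a minimal sketch of the "Python + Request" variant referenced above, mirroring the corrected curl call. The payload fields are assumptions here: the example's actual request body lives in prompt.json and is taken to follow the OpenAI completions schema.

```python
import requests

# Endpoint from the readme; the fix appends /completions to the URL.
url = "http://localhost:8080/predictions/llama-8b-lora/1.0/v1/completions"

# Assumed payload shape (OpenAI-style completions); prompt.json holds the real fields.
payload = {
    "model": "llama-8b-lora",
    "prompt": "Hello world",
    "max_tokens": 50,
}

# json= sets the Content-Type: application/json header automatically.
response = requests.post(url, json=payload)
print(response.json())
```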

examples/large_models/vllm/lora/model-config.yaml (+1, −1)

```diff
@@ -2,7 +2,7 @@
 minWorkers: 1
 maxWorkers: 1
 maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
 deviceType: "gpu"
 asyncCommunication: true

```

examples/large_models/vllm/mistral/model-config.yaml (+1, −1)

```diff
@@ -2,7 +2,7 @@
 minWorkers: 1
 maxWorkers: 1
 maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
 deviceType: "gpu"
 asyncCommunication: true

```

frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java (+7, −6)

```diff
@@ -33,7 +33,7 @@
 public class AsyncWorkerThread extends WorkerThread {
     // protected ConcurrentHashMap requestsInBackend;
     protected static final Logger logger = LoggerFactory.getLogger(AsyncWorkerThread.class);
-    protected static final long MODEL_LOAD_TIMEOUT = 10L;
+    protected static final long WORKER_TIMEOUT = 2L;

     protected boolean loadingFinished;
     protected CountDownLatch latch;
@@ -53,6 +53,7 @@ public AsyncWorkerThread(
     @Override
     public void run() {
         responseTimeout = model.getResponseTimeout();
+        startupTimeout = model.getStartupTimeout();
         Thread thread = Thread.currentThread();
         thread.setName(getWorkerName());
         currentThread.set(thread);
@@ -80,11 +81,11 @@ public void run() {

             if (loadingFinished == false) {
                 latch = new CountDownLatch(1);
-                if (!latch.await(MODEL_LOAD_TIMEOUT, TimeUnit.MINUTES)) {
+                if (!latch.await(startupTimeout, TimeUnit.SECONDS)) {
                     throw new WorkerInitializationException(
-                            "Worker did not load the model within"
-                                    + MODEL_LOAD_TIMEOUT
-                                    + " mins");
+                            "Worker did not load the model within "
+                                    + startupTimeout
+                                    + " seconds");
                 }
             }

@@ -99,7 +100,7 @@ public void run() {
                 logger.debug("Shutting down the thread .. Scaling down.");
             } else {
                 logger.debug(
-                        "Backend worker monitoring thread interrupted or backend worker process died., responseTimeout:"
+                        "Backend worker monitoring thread interrupted or backend worker process died. responseTimeout:"
                                 + responseTimeout
                                 + "sec",
                         e);
```
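
The core behavioral change: the frontend previously waited a hard-coded 10 minutes (MODEL_LOAD_TIMEOUT) for the model to load, and now waits the model's configurable startupTimeout, expressed in seconds. A minimal Python analogue of this wait-with-timeout pattern, with threading.Event standing in for the Java CountDownLatch (a sketch, not the actual implementation):

```python
import threading
import time

# startupTimeout comes from the model config (1200 s in the YAML examples);
# shortened here so the sketch runs quickly.
startup_timeout = 5

loading_finished = threading.Event()  # stand-in for the Java CountDownLatch

def load_model():
    time.sleep(1)  # placeholder for the actual model load
    loading_finished.set()  # counterpart of latch.countDown()

threading.Thread(target=load_model, daemon=True).start()

# Counterpart of latch.await(startupTimeout, TimeUnit.SECONDS).
if not loading_finished.wait(timeout=startup_timeout):
    raise RuntimeError(f"Worker did not load the model within {startup_timeout} seconds")
print("Model loaded within the startup timeout")
```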

ts/llm_launcher.py (+10, −1)

```diff
@@ -67,6 +67,7 @@ def get_model_config(args, model_snapshot_path=None):
         "batchSize": 1,
         "maxBatchDelay": 100,
         "responseTimeout": 1200,
+        "startupTimeout": args.startup_timeout,
         "deviceType": "gpu",
         "asyncCommunication": True,
     }
@@ -227,7 +228,7 @@ def main(args):
     parser.add_argument(
         "--vllm_engine.max_num_seqs",
         type=int,
-        default=16,
+        default=256,
         help="Max sequences in vllm engine",
     )

@@ -245,6 +246,13 @@
         help="Cache dir",
     )

+    parser.add_argument(
+        "--startup_timeout",
+        type=int,
+        default=1200,
+        help="Model startup timeout in seconds",
+    )
+
     parser.add_argument(
         "--engine",
         type=str,
@@ -272,6 +280,7 @@
         default=0.1,
         help="KV Cache free gpu memory fraction",
     )
+
     args = parser.parse_args()

     main(args)
```
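
With the new flag, the launcher feeds a configurable startup timeout into the generated model config, so slow-loading models can be given more time. A hypothetical invocation sketch (the model id is illustrative; --startup_timeout, in seconds, is the option added by this commit):

```python
import subprocess

# Launch TorchServe through the LLM launcher with a longer startup timeout
# for a slow-loading model. The model id below is only an example.
subprocess.run(
    [
        "python", "-m", "ts.llm_launcher",
        "--model_id", "meta-llama/Meta-Llama-3-8B-Instruct",
        "--startup_timeout", "1800",
    ],
    check=True,
)
```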
