From d865e09d9d62073ff6d723a7a48058dd04784ae4 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Fri, 23 Jun 2023 19:16:05 +0200
Subject: [PATCH 01/22] core: Change filter input/output pending API to
 return OK/AGAIN/DONE.

Rename the ap_filter_input_pending() and ap_filter_output_pending() core
hooks to ap_core_input_pending() and ap_core_output_pending() respectively.
Change them to return AGAIN when some data are pending.

Add ap_check_input_pending() and ap_check_output_pending() to run the hooks
and fix the return value (DECLINED -> OK, c->aborted -> DONE).

Adapt the callers to the new API (DECLINED -> OK, OK -> AGAIN, and DONE on
abort).
---
 include/ap_mmn.h              |  4 ++-
 include/httpd.h               |  2 ++
 include/mpm_common.h          | 18 -------------
 include/util_filter.h         | 48 ++++++++++++++++++++++++++++-------
 modules/http/http_request.c   |  2 +-
 modules/proxy/proxy_util.c    | 10 ++++----
 server/core.c                 |  4 +--
 server/mpm/event/event.c      | 12 ++++-----
 server/mpm/motorz/motorz.c    | 24 ++++++++++--------
 server/mpm/simple/simple_io.c | 23 ++++++++++-------
 server/util_filter.c          | 41 ++++++++++++++++++++++--------
 11 files changed, 117 insertions(+), 71 deletions(-)

diff --git a/include/ap_mmn.h b/include/ap_mmn.h
index ad4b77bd33a..acfa61e22b5 100644
--- a/include/ap_mmn.h
+++ b/include/ap_mmn.h
@@ -731,6 +731,8 @@
  *                          and AP_REQUEST_TRUSTED_CT BNOTE.
  * 20211221.24 (2.5.1-dev) Add ap_proxy_fixup_uds_filename()
  * 20211221.25 (2.5.1-dev) AP_SLASHES and AP_IS_SLASH
+ * 20211221.26 (2.5.1-dev) Add AGAIN, ap_check_input_pending() and
+ *                         ap_check_output_pending()
  */
 
 #define MODULE_MAGIC_COOKIE 0x41503235UL /* "AP25" */
@@ -738,7 +740,7 @@
 #ifndef MODULE_MAGIC_NUMBER_MAJOR
 #define MODULE_MAGIC_NUMBER_MAJOR 20211221
 #endif
-#define MODULE_MAGIC_NUMBER_MINOR 25    /* 0...n */
+#define MODULE_MAGIC_NUMBER_MINOR 26    /* 0...n */
 
 /**
  * Determine if the server's current MODULE_MAGIC_NUMBER is at least a

diff --git a/include/httpd.h b/include/httpd.h
index c02b3b7849b..c3f72fceb7e 100644
--- a/include/httpd.h
+++ b/include/httpd.h
@@ -465,6 +465,8 @@ AP_DECLARE(const char *) ap_get_server_built(void);
  */
 #define SUSPENDED -3 /**< Module will handle the remainder of the request.
                       * The core will never invoke the request again */
+#define AGAIN -4     /**< Module wants to be called again when
+                      * more data is available */
 
 /** Returned by the bottom-most filter if no data was written.
  *  @see ap_pass_brigade(). */

diff --git a/include/mpm_common.h b/include/mpm_common.h
index 334624ee065..34c61e2a6c2 100644
--- a/include/mpm_common.h
+++ b/include/mpm_common.h
@@ -450,24 +450,6 @@ AP_DECLARE_HOOK(apr_status_t, mpm_resume_suspended, (conn_rec*))
  */
 AP_DECLARE_HOOK(const char *,mpm_get_name,(void))
 
-/**
- * Hook called to determine whether we should stay within the write completion
- * phase.
- * @param c The current connection
- * @return OK if write completion should continue, DECLINED if write completion
- *         should end gracefully, or a positive error if we should begin to linger.
- * @ingroup hooks
- */
-AP_DECLARE_HOOK(int, output_pending, (conn_rec *c))
-
-/**
- * Hook called to determine whether any data is pending in the input filters.
- * @param c The current connection
- * @return OK if we can read without blocking, DECLINED if a read would block.
- * @ingroup hooks - */ -AP_DECLARE_HOOK(int, input_pending, (conn_rec *c)) - /** * Notification that connection handling is suspending (disassociating from the * current thread) diff --git a/include/util_filter.h b/include/util_filter.h index a03e81c16ca..6a21c486810 100644 --- a/include/util_filter.h +++ b/include/util_filter.h @@ -645,16 +645,15 @@ AP_DECLARE(void) ap_filter_adopt_brigade(ap_filter_t *f, AP_DECLARE(int) ap_filter_should_yield(ap_filter_t *f); /** - * This function determines whether there is unwritten data in the output + * This function determines whether there is pending data in the output * filters, and if so, attempts to make a single write to each filter - * with unwritten data. + * with pending data. * * @param c The connection. - * @return If no unwritten data remains, this function returns DECLINED. - * If some unwritten data remains, this function returns OK. If any - * attempt to write data failed, this functions returns a positive integer. + * @return OK if no pending data remain, AGAIN if some remain, DONE + * if the connection is aborted, anything else on error. */ -AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c); +AP_DECLARE(int) ap_check_output_pending(conn_rec *c); /** * This function determines whether there is pending data in the input @@ -662,10 +661,41 @@ AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c); * socket but not yet returned to the application. * * @param c The connection. - * @return If no pending data remains, this function returns DECLINED. - * If some pending data remains, this function returns OK. + * @return OK if no pending data remain, AGAIN if some remain, DONE + * if the connection is aborted, anything else on error. */ -AP_DECLARE_NONSTD(int) ap_filter_input_pending(conn_rec *c); +AP_DECLARE(int) ap_check_input_pending(conn_rec *c); + +/** + * Hook called to determine whether we should stay within the write completion + * phase. + * @param c The current connection + * @return OK if we can write without blocking, AGAIN if a write would block, + * DECLINED to let the next hook decide, DONE if the connection is aborted, + * anything else on error. + * @ingroup hooks + */ +AP_DECLARE_HOOK(int, output_pending, (conn_rec *c)) + +/** + * Hook called to determine whether any data is pending in the input filters. + * @param c The current connection + * @return OK if we can read without blocking, AGAIN if a read would block, + * DECLINED to let the next hook decide, DONE if the connection is aborted, + * anything else on error. + * @ingroup hooks + */ +AP_DECLARE_HOOK(int, input_pending, (conn_rec *c)) + +/** + * The core output_pending hook. + */ +AP_DECLARE_NONSTD(int) ap_core_output_pending(conn_rec *c); + +/** + * The core input_pending hook. + */ +AP_DECLARE_NONSTD(int) ap_core_input_pending(conn_rec *c); /** * Flush function for apr_brigade_* calls. 
This calls ap_pass_brigade diff --git a/modules/http/http_request.c b/modules/http/http_request.c index 65c389125a7..77bf63edc5f 100644 --- a/modules/http/http_request.c +++ b/modules/http/http_request.c @@ -484,7 +484,7 @@ AP_DECLARE(void) ap_process_request(request_rec *r) ap_process_async_request(r); - if (ap_run_input_pending(c) != OK) { + if (ap_check_input_pending(c) != AGAIN) { bb = ap_acquire_brigade(c); b = apr_bucket_flush_create(c->bucket_alloc); APR_BRIGADE_INSERT_HEAD(bb, b); diff --git a/modules/proxy/proxy_util.c b/modules/proxy/proxy_util.c index cbc31104c37..88d174220d8 100644 --- a/modules/proxy/proxy_util.c +++ b/modules/proxy/proxy_util.c @@ -5888,12 +5888,12 @@ PROXY_DECLARE(int) ap_proxy_tunnel_run(proxy_tunnel_rec *tunnel) "proxy: %s: %s output ready", scheme, out->name); - rc = ap_filter_output_pending(out->c); - if (rc == OK) { - /* Keep polling out (only) */ + rc = ap_check_output_pending(out->c); + if (rc == AGAIN) { + /* Keep polling (OUT only) */ continue; } - if (rc != DECLINED) { + if (rc != OK) { /* Real failure, bail out */ ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10221) "proxy: %s: %s flushing failed (%i)", @@ -5923,7 +5923,7 @@ PROXY_DECLARE(int) ap_proxy_tunnel_run(proxy_tunnel_rec *tunnel) /* Flush any pending input data now, we don't know when * the next POLLIN will trigger and retaining data might * deadlock the underlying protocol. We don't check for - * pending data first with ap_filter_input_pending() since + * pending data first with ap_check_input_pending() since * the read from proxy_tunnel_transfer() is nonblocking * anyway and returning OK if there's no data. */ diff --git a/server/core.c b/server/core.c index 9f92981ef0d..4d5d569d93b 100644 --- a/server/core.c +++ b/server/core.c @@ -6150,9 +6150,9 @@ static void register_hooks(apr_pool_t *p) ap_hook_get_pollfd_from_conn(core_get_pollfd_from_conn, NULL, NULL, APR_HOOK_REALLY_LAST); - ap_hook_input_pending(ap_filter_input_pending, NULL, NULL, + ap_hook_input_pending(ap_core_input_pending, NULL, NULL, APR_HOOK_MIDDLE); - ap_hook_output_pending(ap_filter_output_pending, NULL, NULL, + ap_hook_output_pending(ap_core_output_pending, NULL, NULL, APR_HOOK_MIDDLE); /* register the core's insert_filter hook and register core-provided diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 403f9a3c939..ab49f34cf44 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -1233,7 +1233,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc } if (cs->pub.state == CONN_STATE_WRITE_COMPLETION) { - int pending = DECLINED; + int pending = OK; /* Flush all pending outputs before going to CONN_STATE_KEEPALIVE or * straight to CONN_STATE_PROCESSING if inputs are pending already. 
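Taken together, the adapted callers follow one pattern. A minimal sketch of
driving write completion with the new wrappers (illustration only, not code
from the tree; the polling step is left abstract):

    #include "httpd.h"        /* OK, AGAIN, DONE, conn_rec */
    #include "util_filter.h"  /* ap_check_{in,out}put_pending() */

    /* Drive write completion with the normalized pending API:
     *   OK    -> nothing left to flush, keepalive/input checks may follow
     *   AGAIN -> data still pending, poll for writability and come back
     *   DONE  -> connection aborted, proceed to (lingering) close
     *   other -> real error, close the connection
     */
    static int drive_write_completion(conn_rec *c)
    {
        int rc = ap_check_output_pending(c);
        if (rc == AGAIN) {
            /* e.g. add c's socket to a pollset with APR_POLLOUT */
            return rc;
        }
        if (rc != OK) {
            return rc; /* DONE (aborted) or an error: close/linger */
        }
        /* Flushed; see whether a new request is already readable */
        return ap_check_input_pending(c);
    }
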
@@ -1243,12 +1243,12 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc if (from_wc_q) { from_wc_q = 0; /* one shot */ - pending = ap_run_output_pending(c); + pending = ap_check_output_pending(c); } else if (ap_filter_should_yield(c->output_filters)) { - pending = OK; + pending = AGAIN; } - if (pending == OK) { + if (pending == AGAIN) { /* Let the event thread poll for write */ cs->queue_timestamp = apr_time_now(); notify_suspend(cs); @@ -1274,11 +1274,11 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc } return; } - if (pending != DECLINED || c->aborted || c->keepalive != AP_CONN_KEEPALIVE) { + if (pending != OK || c->aborted || c->keepalive != AP_CONN_KEEPALIVE) { cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } - if (ap_run_input_pending(c) == OK) { + if (ap_check_input_pending(c) == AGAIN) { goto process_connection; } if (listener_may_exit) { diff --git a/server/mpm/motorz/motorz.c b/server/mpm/motorz/motorz.c index 8feff2965c2..7026d08cd6e 100644 --- a/server/mpm/motorz/motorz.c +++ b/server/mpm/motorz/motorz.c @@ -408,8 +408,8 @@ static apr_status_t motorz_io_process(motorz_conn_t *scon) ap_update_child_status(scon->sbh, SERVER_BUSY_WRITE, NULL); - pending = ap_run_output_pending(c); - if (pending == OK) { + pending = ap_check_output_pending(c); + if (pending == AGAIN) { /* Still in WRITE_COMPLETION_STATE: * Set a write timeout for this connection, and let the * event thread poll for writeability. @@ -432,18 +432,22 @@ static apr_status_t motorz_io_process(motorz_conn_t *scon) } return APR_SUCCESS; } - if (pending != DECLINED - || c->keepalive != AP_CONN_KEEPALIVE - || c->aborted) { - scon->cs.state = CONN_STATE_LINGER; + if (c->keepalive != AP_CONN_KEEPALIVE) { + pending = DONE; } - else if (ap_run_input_pending(c) == OK) { - scon->cs.state = CONN_STATE_PROCESSING; - goto read_request; + else if (pending == OK) { + pending = ap_check_input_pending(c); + if (pending == AGAIN) { + scon->cs.state = CONN_STATE_PROCESSING; + goto read_request; + } } - else { + if (pending == OK) { scon->cs.state = CONN_STATE_KEEPALIVE; } + else { + scon->cs.state = CONN_STATE_LINGER; + } } if (scon->cs.state == CONN_STATE_LINGER) { diff --git a/server/mpm/simple/simple_io.c b/server/mpm/simple/simple_io.c index fb509ed756a..36c5ad87956 100644 --- a/server/mpm/simple/simple_io.c +++ b/server/mpm/simple/simple_io.c @@ -96,8 +96,8 @@ static apr_status_t simple_io_process(simple_conn_t * scon) int pending; ap_update_child_status(c->sbh, SERVER_BUSY_WRITE, NULL); - pending = ap_run_output_pending(c); - if (pending == OK) { + pending = ap_check_output_pending(c); + if (pending == AGAIN) { /* Still in WRITE_COMPLETION_STATE: * Set a write timeout for this connection, and let the * event thread poll for writeability. 
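The motorz.c hunk above and the simple_io.c hunk below make the same
keepalive-versus-linger decision once write completion is done (the AGAIN
case, which re-queues the socket for write polling, returns earlier).
Condensed into a standalone helper under the patch's semantics (a sketch,
not code from the tree):

    /* Next state once write completion returned `pending` (OK, DONE or an
     * error; AGAIN was already handled by polling for write), mirroring
     * the motorz.c and simple_io.c blocks. */
    static conn_state_e next_state_after_wc(conn_rec *c, int pending)
    {
        if (c->keepalive != AP_CONN_KEEPALIVE) {
            pending = DONE;                    /* no keepalive: linger */
        }
        else if (pending == OK) {
            pending = ap_check_input_pending(c);
            if (pending == AGAIN) {
                return CONN_STATE_PROCESSING;  /* next request readable */
            }
        }
        return (pending == OK) ? CONN_STATE_KEEPALIVE : CONN_STATE_LINGER;
    }
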
@@ -126,17 +126,22 @@ static apr_status_t simple_io_process(simple_conn_t * scon) } return APR_SUCCESS; } - if (pending != DECLINED - || c->keepalive != AP_CONN_KEEPALIVE - || c->aborted) { - scon->cs.state = CONN_STATE_LINGER; + if (c->keepalive != AP_CONN_KEEPALIVE) { + pending = DONE; } - else if (ap_run_input_pending(c) == OK) { - scon->cs.state = CONN_STATE_PROCESSING; + else if (pending == OK) { + pending = ap_check_input_pending(c); + if (pending == AGAIN) { + scon->cs.state = CONN_STATE_PROCESSING; + continue; + } } - else { + if (pending == OK) { scon->cs.state = CONN_STATE_KEEPALIVE; } + else { + scon->cs.state = CONN_STATE_LINGER; + } } if (scon->cs.state == CONN_STATE_LINGER) { diff --git a/server/util_filter.c b/server/util_filter.c index 3b1e96fb447..d8dc103d80f 100644 --- a/server/util_filter.c +++ b/server/util_filter.c @@ -393,7 +393,7 @@ static apr_status_t request_filter_cleanup(void *arg) /* A request filter is cleaned up with an EOR bucket, so possibly * while it is handling/passing the EOR, and we want each filter or - * ap_filter_output_pending() to be able to dereference f until they + * ap_check_output_pending() to be able to dereference f until they * return. So request filters are recycled in dead_filters and will only * be moved to spare_filters when recycle_dead_filters() is called, i.e. * in ap_filter_{in,out}put_pending(). Set f->r to NULL still for any use @@ -978,7 +978,7 @@ AP_DECLARE(apr_status_t) ap_filter_setaside_brigade(ap_filter_t *f, e = next) { next = APR_BUCKET_NEXT(e); - /* WC buckets will be added back by ap_filter_output_pending() + /* WC buckets will be added back by ap_check_output_pending() * at the tail. */ if (AP_BUCKET_IS_WC(e)) { @@ -1267,7 +1267,7 @@ AP_DECLARE(int) ap_filter_should_yield(ap_filter_t *f) return 0; } -AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c) +AP_DECLARE_NONSTD(int) ap_core_output_pending(conn_rec *c) { struct ap_filter_conn_ctx *x = c->filter_conn_ctx; struct ap_filter_private *fp, *prev; @@ -1312,7 +1312,7 @@ AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c) } if (ap_filter_should_yield(f)) { - rc = OK; + rc = AGAIN; break; } } @@ -1320,15 +1320,26 @@ AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c) ap_release_brigade(c, bb); cleanup: - /* All filters have returned, time to recycle/unleak ap_filter_t-s + /* All filters have returned, time to recycle/unleak dead filters * before leaving (i.e. make them reusable). */ recycle_dead_filters(c); return rc; } +AP_DECLARE(int) ap_check_output_pending(conn_rec *c) +{ + int rc = ap_run_output_pending(c); + if (rc == DECLINED) { + rc = OK; + } + if (rc == OK && c->aborted) { + rc = DONE; + } + return rc; +} -AP_DECLARE_NONSTD(int) ap_filter_input_pending(conn_rec *c) +AP_DECLARE_NONSTD(int) ap_core_input_pending(conn_rec *c) { struct ap_filter_conn_ctx *x = c->filter_conn_ctx; struct ap_filter_private *fp; @@ -1349,21 +1360,31 @@ AP_DECLARE_NONSTD(int) ap_filter_input_pending(conn_rec *c) */ AP_DEBUG_ASSERT(fp->bb); e = APR_BRIGADE_FIRST(fp->bb); - if (e != APR_BRIGADE_SENTINEL(fp->bb) - && e->length != (apr_size_t)(-1)) { - rc = OK; + if (e != APR_BRIGADE_SENTINEL(fp->bb) && e->length != (apr_size_t)-1) { + rc = AGAIN; break; } } cleanup: - /* All filters have returned, time to recycle/unleak ap_filter_t-s + /* All filters have returned, time to recycle/unleak dead filters * before leaving (i.e. make them reusable). 
*/ recycle_dead_filters(c); return rc; } +AP_DECLARE(int) ap_check_input_pending(conn_rec *c) +{ + int rc = ap_run_input_pending(c); + if (rc == DECLINED) { + rc = OK; + } + if (rc == OK && c->aborted) { + rc = DONE; + } + return rc; +} AP_DECLARE_NONSTD(apr_status_t) ap_filter_flush(apr_bucket_brigade *bb, void *ctx) From a92dd49afaeb3cbaec448c17cf8ab0eee18f8e65 Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 11 Jul 2023 20:26:41 +0200 Subject: [PATCH 02/22] mpm_event,listen: Improve/fix children maintenance when num_buckets > 1. --- server/listen.c | 47 +++++----- server/mpm/event/event.c | 184 +++++++++++++++++++-------------------- 2 files changed, 113 insertions(+), 118 deletions(-) diff --git a/server/listen.c b/server/listen.c index 3aed907e21b..f32826e4774 100644 --- a/server/listen.c +++ b/server/listen.c @@ -333,9 +333,7 @@ static apr_status_t alloc_systemd_listener(process_rec * process, si.type = SOCK_STREAM; si.protocol = APR_PROTO_TCP; - rec = apr_palloc(process->pool, sizeof(ap_listen_rec)); - rec->active = 0; - rec->next = 0; + rec = apr_pcalloc(process->pool, sizeof(ap_listen_rec)); rv = apr_os_sock_make(&rec->sd, &si, process->pool); if (rv != APR_SUCCESS) { @@ -462,8 +460,8 @@ static const char *alloc_listener(process_rec *process, const char *addr, apr_pool_t *temp_pool, apr_uint32_t flags) { ap_listen_rec *last; + apr_sockaddr_t *sa, *next_sa; apr_status_t status; - apr_sockaddr_t *sa; /* see if we've got a listener for this address:port, which is an error */ if (find_listeners(&ap_listeners, NULL, addr, port, scope_id, temp_pool)) { @@ -494,22 +492,23 @@ static const char *alloc_listener(process_rec *process, const char *addr, last = last->next; } - while (sa) { + for (; sa; sa = next_sa) { ap_listen_rec *new; + /* Each listener has its own (unlinked) address */ + next_sa = sa->next; + sa->next = NULL; + /* this has to survive restarts */ new = apr_palloc(process->pool, sizeof(ap_listen_rec)); new->active = 0; - new->next = 0; + new->next = NULL; new->bind_addr = sa; new->protocol = apr_pstrdup(process->pool, proto); new->flags = flags; - /* Go to the next sockaddr. 
*/ - sa = sa->next; - status = apr_socket_create(&new->sd, new->bind_addr->family, - SOCK_STREAM, 0, process->pool); + SOCK_STREAM, 0, process->pool); #if APR_HAVE_IPV6 /* What could happen is that we got an IPv6 address, but this system @@ -861,36 +860,36 @@ AP_DECLARE(apr_status_t) ap_duplicate_listeners(apr_pool_t *p, server_rec *s, lr = ap_listeners; while (lr) { ap_listen_rec *duplr; - char *hostname; - apr_port_t port; - apr_sockaddr_t *sa; #ifdef HAVE_SYSTEMD if (use_systemd) { int thesock; apr_os_sock_get(&thesock, lr->sd); if ((stat = alloc_systemd_listener(s->process, thesock, - lr->protocol, &duplr)) != APR_SUCCESS) { + lr->protocol, &duplr))) { return stat; } } else #endif { - duplr = apr_palloc(p, sizeof(ap_listen_rec)); - duplr->slave = NULL; + duplr = apr_pcalloc(p, sizeof(ap_listen_rec)); duplr->protocol = apr_pstrdup(p, lr->protocol); - hostname = apr_pstrdup(p, lr->bind_addr->hostname); - port = lr->bind_addr->port; - stat = apr_sockaddr_info_get(&sa, hostname, APR_UNSPEC, port, 0, p); + duplr->flags = lr->flags; +#if APR_VERSION_AT_LEAST(1,6,0) + stat = apr_sockaddr_info_copy(&duplr->bind_addr, + lr->bind_addr, p); +#else + stat = apr_sockaddr_info_get(&duplr->bind_addr, + lr->bind_addr->hostname, + lr->bind_addr->family, + lr->bind_addr->port, 0, p); +#endif if (stat != APR_SUCCESS) { ap_log_perror(APLOG_MARK, APLOG_CRIT, stat, p, APLOGNO(10397) - "failure looking up %s to duplicate " - "listening socket", hostname); + "failure duplicating address %pI for " + "listening socket", lr->bind_addr); return stat; } - duplr->bind_addr = sa; - duplr->next = NULL; - duplr->flags = lr->flags; stat = apr_socket_create(&duplr->sd, duplr->bind_addr->family, SOCK_STREAM, 0, p); if (stat != APR_SUCCESS) { diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index ab49f34cf44..29a7b2bd345 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -436,13 +436,15 @@ typedef struct event_retained_data { * Workers that still active, i.e. are not shutting down gracefully. */ int active_daemons; + /* * idle_spawn_rate is the number of children that will be spawned on the * next maintenance cycle if there aren't enough idle servers. It is - * maintained per listeners bucket, doubled up to MAX_SPAWN_RATE, and - * reset only when a cycle goes by without the need to spawn. + * doubled up to MAX_SPAWN_RATE, and reset only when a cycle goes by + * without the need to spawn. */ - int *idle_spawn_rate; + int idle_spawn_rate; + int max_spawn_rate, *free_slots; int hold_off_on_exponential_spawning; } event_retained_data; static event_retained_data *retained; @@ -450,7 +452,6 @@ static event_retained_data *retained; #ifndef MAX_SPAWN_RATE #define MAX_SPAWN_RATE 32 #endif -static int max_spawn_rate_per_bucket = MAX_SPAWN_RATE / 1; struct event_srv_cfg_s { struct timeout_queue *io_q, @@ -3144,9 +3145,9 @@ static void child_main(int child_num_arg, int child_bucket) clean_child_exit(resource_shortage ? 
APEXIT_CHILDSICK : 0); } -static int make_child(server_rec * s, int slot, int bucket) +static int make_child(server_rec *s, int slot) { - int pid; + int pid, bucket = slot % retained->mpm->num_buckets; if (slot + 1 > retained->max_daemon_used) { retained->max_daemon_used = slot + 1; @@ -3230,32 +3231,27 @@ static void startup_children(int number_to_start) if (ap_scoreboard_image->parent[i].pid != 0) { continue; } - if (make_child(ap_server_conf, i, i % retained->mpm->num_buckets) < 0) { + if (make_child(ap_server_conf, i) < 0) { break; } --number_to_start; } } -static void perform_idle_server_maintenance(int child_bucket, - int *max_daemon_used) +static void perform_idle_server_maintenance(void) { - int num_buckets = retained->mpm->num_buckets; - int idle_thread_count = 0; - process_score *ps; - int free_length = 0; - int free_slots[MAX_SPAWN_RATE]; + volatile process_score *ps; + const int num_buckets = retained->mpm->num_buckets; int last_non_dead = -1; + int free_length = 0, free_bucket = 0; + int max_daemon_used = 0; + int idle_thread_count = 0; int active_thread_count = 0; int i, j; for (i = 0; i < server_limit; ++i) { - if (num_buckets > 1 && (i % num_buckets) != child_bucket) { - /* We only care about child_bucket in this call */ - continue; - } if (i >= retained->max_daemon_used && - free_length == retained->idle_spawn_rate[child_bucket]) { + free_length == retained->idle_spawn_rate) { /* short cut if all active processes have been examined and * enough empty scoreboard slots have been found */ @@ -3299,12 +3295,16 @@ static void perform_idle_server_maintenance(int child_bucket, } last_non_dead = i; } - else if (free_length < retained->idle_spawn_rate[child_bucket]) { - free_slots[free_length++] = i; + else if (free_length < retained->idle_spawn_rate + && (i % num_buckets) == free_bucket) { + retained->free_slots[free_length++] = i; + if (++free_bucket == num_buckets) { + free_bucket = 0; + } } } - if (*max_daemon_used < last_non_dead + 1) { - *max_daemon_used = last_non_dead + 1; + if (max_daemon_used < last_non_dead + 1) { + max_daemon_used = last_non_dead + 1; } if (retained->sick_child_detected) { @@ -3315,10 +3315,6 @@ static void perform_idle_server_maintenance(int child_bucket, */ retained->sick_child_detected = 0; } - else if (child_bucket < num_buckets - 1) { - /* check for had_healthy_child up to the last child bucket */ - return; - } else { /* looks like a basket case, as no child ever fully initialized; give up. 
*/ @@ -3338,7 +3334,7 @@ static void perform_idle_server_maintenance(int child_bucket, && retained->total_daemons <= retained->max_daemon_used && retained->max_daemon_used <= server_limit); - if (idle_thread_count > max_spare_threads / num_buckets) { + if (idle_thread_count > max_spare_threads) { /* * Child processes that we ask to shut down won't die immediately * but may stay around for a long time when they finish their @@ -3367,17 +3363,19 @@ static void perform_idle_server_maintenance(int child_bucket, retained->total_daemons, retained->max_daemon_used, server_limit, idle_thread_count, max_workers); if (do_kill) { - ap_mpm_podx_signal(retained->buckets[child_bucket].pod, - AP_MPM_PODX_GRACEFUL); + for (i = 0; i < num_buckets; ++i) { + ap_mpm_podx_signal(retained->buckets[i].pod, + AP_MPM_PODX_GRACEFUL); + } } else { /* Wait for dying daemon(s) to exit */ } - retained->idle_spawn_rate[child_bucket] = 1; + retained->idle_spawn_rate = num_buckets; } - else if (idle_thread_count < min_spare_threads / num_buckets) { - if (active_thread_count >= max_workers / num_buckets) { - if (0 == idle_thread_count) { + else if (idle_thread_count < min_spare_threads) { + if (active_thread_count >= max_workers) { + if (0 == idle_thread_count) { if (!retained->maxclients_reported) { ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(00484) "server reached MaxRequestWorkers setting, " @@ -3395,18 +3393,15 @@ static void perform_idle_server_maintenance(int child_bucket, retained->near_maxclients_reported = 1; } } - retained->idle_spawn_rate[child_bucket] = 1; + retained->idle_spawn_rate = num_buckets; } else if (free_length == 0) { /* scoreboard is full, can't fork */ ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(03490) "scoreboard is full, not at MaxRequestWorkers." 
"Increase ServerLimit."); - retained->idle_spawn_rate[child_bucket] = 1; + retained->idle_spawn_rate = num_buckets; } else { - if (free_length > retained->idle_spawn_rate[child_bucket]) { - free_length = retained->idle_spawn_rate[child_bucket]; - } if (free_length + retained->active_daemons > active_daemons_limit) { if (retained->active_daemons < active_daemons_limit) { free_length = active_daemons_limit - retained->active_daemons; @@ -3418,14 +3413,13 @@ static void perform_idle_server_maintenance(int child_bucket, "total %d/%d/%d, rate %d", free_length, retained->active_daemons, active_daemons_limit, retained->total_daemons, retained->max_daemon_used, - server_limit, retained->idle_spawn_rate[child_bucket]); + server_limit, retained->idle_spawn_rate); /* reset the spawning rate and prevent its growth below */ - retained->idle_spawn_rate[child_bucket] = 1; - ++retained->hold_off_on_exponential_spawning; + retained->idle_spawn_rate = num_buckets; free_length = 0; } } - if (retained->idle_spawn_rate[child_bucket] >= 8) { + if (retained->idle_spawn_rate >= retained->max_spawn_rate / 4) { ap_log_error(APLOG_MARK, APLOG_INFO, 0, ap_server_conf, APLOGNO(00486) "server seems busy, (you may need " "to increase StartServers, ThreadsPerChild " @@ -3436,13 +3430,14 @@ static void perform_idle_server_maintenance(int child_bucket, idle_thread_count, retained->active_daemons, retained->total_daemons); } + free_length = (free_length / num_buckets) * num_buckets; for (i = 0; i < free_length; ++i) { - int slot = free_slots[i]; - if (make_child(ap_server_conf, slot, child_bucket) < 0) { + int slot = retained->free_slots[i]; + if (make_child(ap_server_conf, slot) < 0) { continue; } - if (*max_daemon_used < slot + 1) { - *max_daemon_used = slot + 1; + if (max_daemon_used < slot + 1) { + max_daemon_used = slot + 1; } } /* the next time around we want to spawn twice as many if this @@ -3451,31 +3446,41 @@ static void perform_idle_server_maintenance(int child_bucket, if (retained->hold_off_on_exponential_spawning) { --retained->hold_off_on_exponential_spawning; } - else if (retained->idle_spawn_rate[child_bucket] - < max_spawn_rate_per_bucket) { - int new_rate = retained->idle_spawn_rate[child_bucket] * 2; - if (new_rate > max_spawn_rate_per_bucket) { - new_rate = max_spawn_rate_per_bucket; + else if (free_length && retained->idle_spawn_rate < retained->max_spawn_rate) { + int new_rate = retained->idle_spawn_rate * 2; + new_rate = ((new_rate + num_buckets - 1) / num_buckets) * num_buckets; + if (new_rate > retained->max_spawn_rate) { + new_rate = retained->max_spawn_rate; } - retained->idle_spawn_rate[child_bucket] = new_rate; + retained->idle_spawn_rate = new_rate; } } } else { - retained->idle_spawn_rate[child_bucket] = 1; + retained->idle_spawn_rate = num_buckets; + } + + retained->max_daemon_used = max_daemon_used; + if (APLOGdebug(ap_server_conf)) { + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, + "score: idlers:%d, " + "threads active:%d/%d max:%d, " + "daemons active:%d/%d max:%d used:%d/%d/%d", + idle_thread_count, + active_thread_count, retained->active_daemons * threads_per_child, + max_workers, retained->active_daemons, retained->total_daemons, + active_daemons_limit, max_daemon_used, retained->max_daemon_used, + server_limit); } } static void server_main_loop(int remaining_children_to_start) { - int num_buckets = retained->mpm->num_buckets; - int max_daemon_used = 0; int successive_kills = 0; int child_slot; apr_exit_why_e exitwhy; int status, processed_status; apr_proc_t pid; - int i; 
while (!retained->mpm->restart_pending && !retained->mpm->shutdown_pending) { ap_wait_or_timeout(&exitwhy, &status, &pid, pconf, ap_server_conf); @@ -3520,14 +3525,13 @@ static void server_main_loop(int remaining_children_to_start) if (processed_status == APEXIT_CHILDSICK) { /* resource shortage, minimize the fork rate */ - retained->idle_spawn_rate[child_slot % num_buckets] = 1; + retained->idle_spawn_rate = retained->mpm->num_buckets; } else if (remaining_children_to_start) { /* we're still doing a 1-for-1 replacement of dead * children with new children */ - make_child(ap_server_conf, child_slot, - child_slot % num_buckets); + make_child(ap_server_conf, child_slot); --remaining_children_to_start; } } @@ -3586,11 +3590,7 @@ static void server_main_loop(int remaining_children_to_start) continue; } - max_daemon_used = 0; - for (i = 0; i < num_buckets; i++) { - perform_idle_server_maintenance(i, &max_daemon_used); - } - retained->max_daemon_used = max_daemon_used; + perform_idle_server_maintenance(); } } @@ -3680,35 +3680,36 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) retained->buckets[i].listeners = listen_buckets[i]; } - if (retained->mpm->max_buckets < num_buckets) { - int new_max, *new_ptr; - new_max = retained->mpm->max_buckets * 2; - if (new_max < num_buckets) { - new_max = num_buckets; + /* If num_buckets changed, adjust max_spawn_rate and the free_slots buffer */ + if (retained->mpm->num_buckets != num_buckets) { + if (retained->mpm->max_buckets < num_buckets) { + int new_max, new_slots; + new_max = retained->mpm->max_buckets * 2; + if (new_max < num_buckets) { + new_max = num_buckets; + } + else { + new_max = ((new_max + num_buckets - 1) / num_buckets) * num_buckets; + } + new_slots = ((MAX_SPAWN_RATE + new_max - 1) / new_max) * new_max; + retained->free_slots = apr_palloc(ap_pglobal, new_slots * sizeof(int)); + retained->mpm->max_buckets = new_max; } - new_ptr = (int *)apr_palloc(ap_pglobal, new_max * sizeof(int)); - if (retained->mpm->num_buckets) /* idle_spawn_rate NULL at startup */ - memcpy(new_ptr, retained->idle_spawn_rate, - retained->mpm->num_buckets * sizeof(int)); - retained->idle_spawn_rate = new_ptr; - retained->mpm->max_buckets = new_max; - } - if (retained->mpm->num_buckets < num_buckets) { - int rate_max = 1; - /* If new buckets are added, set their idle spawn rate to - * the highest so far, so that they get filled as quickly - * as the existing ones. + /* We always spawn/kill children in a multiple of num_buckets (as needed), + * so align (round up) max_spawn_rate and idle_spawn_rate to num_buckets. */ - for (i = 0; i < retained->mpm->num_buckets; i++) { - if (rate_max < retained->idle_spawn_rate[i]) { - rate_max = retained->idle_spawn_rate[i]; - } + retained->max_spawn_rate = (((MAX_SPAWN_RATE + num_buckets - 1) + / num_buckets) * num_buckets); + retained->idle_spawn_rate = (((retained->idle_spawn_rate + num_buckets - 1) + / num_buckets) * num_buckets); + if (retained->idle_spawn_rate < num_buckets) { + retained->idle_spawn_rate = num_buckets; } - for (/* up to date i */; i < num_buckets; i++) { - retained->idle_spawn_rate[i] = rate_max; + else if (retained->idle_spawn_rate > retained->max_spawn_rate) { + retained->idle_spawn_rate = retained->max_spawn_rate; } + retained->mpm->num_buckets = num_buckets; } - retained->mpm->num_buckets = num_buckets; /* Don't thrash since num_buckets depends on the * system and the number of online CPU cores... 
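The alignment idiom repeated above keeps every spawn/kill decision a whole
multiple of num_buckets, so each listeners bucket gains or loses the same
number of children. Isolated for clarity (a sketch, not a helper that exists
in the tree):

    /* Round n up to the next multiple of num_buckets, the idiom used above
     * for max_spawn_rate, idle_spawn_rate and the spawned free_length. */
    static int round_up_to_buckets(int n, int num_buckets)
    {
        return ((n + num_buckets - 1) / num_buckets) * num_buckets;
    }
    /* e.g. MAX_SPAWN_RATE 32 with num_buckets 3 gives 33, i.e. up to 11
     * children per bucket; a slot's bucket is then slot % num_buckets. */
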
@@ -3728,11 +3729,6 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) if (max_spare_threads < min_spare_threads + (threads_per_child + 1) * num_buckets) max_spare_threads = min_spare_threads + (threads_per_child + 1) * num_buckets; - max_spawn_rate_per_bucket = (MAX_SPAWN_RATE + num_buckets - 1) / num_buckets; - if (max_spawn_rate_per_bucket < 1) { - max_spawn_rate_per_bucket = 1; - } - /* If we're doing a graceful_restart then we're going to see a lot * of children exiting immediately when we get into the main loop * below (because we just sent them AP_SIG_GRACEFUL). This happens pretty From b85d7387f59eff51261744997abf81b3cc7c9d55 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 3 Jun 2024 15:35:47 +0200 Subject: [PATCH 03/22] mpm_event: Add helpers, simplify code and improve logging before functional changes. --- modules/http/http_request.c | 3 +- server/mpm/event/event.c | 1338 +++++++++++++++++++-------------- server/mpm/motorz/motorz.c | 14 +- server/mpm/simple/simple_io.c | 10 +- 4 files changed, 787 insertions(+), 578 deletions(-) diff --git a/modules/http/http_request.c b/modules/http/http_request.c index 77bf63edc5f..c8f157eca80 100644 --- a/modules/http/http_request.c +++ b/modules/http/http_request.c @@ -431,7 +431,8 @@ void ap_process_async_request(request_rec *r) const apr_array_header_t *t_h = apr_table_elts(r->headers_in); const apr_table_entry_t *t_elt = (apr_table_entry_t *)t_h->elts; ap_log_rerror(APLOG_MARK, APLOG_TRACE4, 0, r, - "Headers received from client:"); + "Header received from client:"); + ap_log_rerror(APLOG_MARK, APLOG_TRACE4, 0, r, " %s", r->the_request); for (i = 0; i < t_h->nelts; i++, t_elt++) { ap_log_rerror(APLOG_MARK, APLOG_TRACE4, 0, r, " %s: %s", ap_escape_logitem(r->pool, t_elt->key), diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 29a7b2bd345..64ff1e30ead 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -149,7 +149,7 @@ #define MPM_CHILD_PID(i) (ap_scoreboard_image->parent[i].pid) #if !APR_VERSION_AT_LEAST(1,4,0) -#define apr_time_from_msec(x) (x * 1000) +#define apr_time_from_msec(x) ((x) * 1000) #endif #define CONN_STATE_IS_LINGERING_CLOSE(s) ((s) >= CONN_STATE_LINGER && \ @@ -159,6 +159,21 @@ #endif #define SECONDS_TO_LINGER 2 +/* Don't wait more time in poll() if APR_POLLSET_WAKEABLE is not implemented */ +#define NON_WAKEABLE_TIMEOUT apr_time_from_msec(100) + +/* Prevent extra poll/wakeup calls for timeouts close in the future (queues + * have the granularity of a second anyway). + * XXX: Wouldn't 0.5s (instead of 0.1s) be "enough"? + */ +#define QUEUES_FUDGE_TIMEOUT apr_time_from_msec(100) + +/* Same goal as for QUEUES_FUDGE_TIMEOUT, but applied to timers. + * XXX: Since their timeouts are custom (user defined), we can't be too + * approximative here (using 5ms). 
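+ * (expiries within this margin are batched to spare poll() wakeups)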
+ */ +#define TIMERS_FUDGE_TIMEOUT apr_time_from_msec(5) + /* * Actual definitions of config globals */ @@ -178,26 +193,27 @@ static int active_daemons_limit = 0; /* MaxRequestWorkers / ThreadsPerChi static int max_workers = 0; /* MaxRequestWorkers */ static int server_limit = 0; /* ServerLimit */ static int thread_limit = 0; /* ThreadLimit */ -static int had_healthy_child = 0; +static int conns_this_child = 0; /* MaxConnectionsPerChild, only accessed + in listener thread */ static volatile int dying = 0; static volatile int workers_may_exit = 0; static volatile int start_thread_may_exit = 0; static volatile int listener_may_exit = 0; -static int listener_is_wakeable = 0; /* Pollset supports APR_POLLSET_WAKEABLE */ -static int num_listensocks = 0; -static apr_int32_t conns_this_child; /* MaxConnectionsPerChild, only access - in listener thread */ static apr_uint32_t connection_count = 0; /* Number of open connections */ static apr_uint32_t lingering_count = 0; /* Number of connections in lingering close */ static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ static apr_uint32_t clogged_count = 0; /* Number of threads processing ssl conns */ static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown early during graceful termination */ +static int had_healthy_child = 0; static int resource_shortage = 0; + static fd_queue_t *worker_queue; static fd_queue_info_t *worker_queue_info; -static apr_thread_mutex_t *timeout_mutex; +static int num_listensocks = 0; +static int listener_is_wakeable = 0; /* Pollset supports APR_POLLSET_WAKEABLE */ +static apr_pollfd_t *listener_pollfd; module AP_MODULE_DECLARE_DATA mpm_event_module; @@ -205,7 +221,8 @@ module AP_MODULE_DECLARE_DATA mpm_event_module; struct event_srv_cfg_s; typedef struct event_srv_cfg_s event_srv_cfg; -static apr_pollfd_t *listener_pollfd; +struct timeout_queue; +static apr_thread_mutex_t *timeout_mutex; /* * The pollset for sockets that are in any of the timeout queues. Currently @@ -219,18 +236,13 @@ static apr_pollfd_t *listener_pollfd; static apr_pollset_t *event_pollset; typedef struct event_conn_state_t event_conn_state_t; - -/* - * The chain of connections to be shutdown by a worker thread (deferred), - * linked list updated atomically. - */ -static event_conn_state_t *volatile defer_linger_chain; - struct event_conn_state_t { /** APR_RING of expiration timeouts */ APR_RING_ENTRY(event_conn_state_t) timeout_list; - /** the time when the entry was queued */ - apr_time_t queue_timestamp; + /** public parts of the connection state */ + conn_state_t pub; + /** memory pool allocated on and to allocate from (ptrans) */ + apr_pool_t *p; /** connection record this struct refers to */ conn_rec *c; /** request record (if any) this struct refers to */ @@ -239,37 +251,101 @@ struct event_conn_state_t { event_srv_cfg *sc; /** scoreboard handle for the conn_rec */ ap_sb_handle_t *sbh; - /** is the current conn_rec suspended? 
(disassociated with - * a particular MPM thread; for suspend_/resume_connection - * hooks) - */ - int suspended; - /** memory pool to allocate from */ - apr_pool_t *p; /** bucket allocator */ apr_bucket_alloc_t *bucket_alloc; + + /* + * when queued to the listener + */ /** poll file descriptor information */ apr_pollfd_t pfd; - /** public parts of the connection state */ - conn_state_t pub; + /** the time when the entry was queued */ + apr_time_t queue_timestamp; + /** the timeout queue for this entry */ + struct timeout_queue *q; + + /* + * when queued to workers + */ /** chaining in defer_linger_chain */ struct event_conn_state_t *chain; - unsigned int + + /* + * bools as bits + */ + unsigned int + /** Is the current conn_rec suspended? (disassociated with + * a particular MPM thread; for suspend_/resume_connection + * hooks) + */ + suspended :1, /** Is lingering close from defer_lingering_close()? */ deferred_linger :1, /** Has ap_start_lingering_close() been called? */ linger_started :1; }; -APR_RING_HEAD(timeout_head_t, event_conn_state_t); +static APR_INLINE apr_socket_t *cs_sd(event_conn_state_t *cs) +{ + ap_assert(cs != NULL); + return cs->pfd.desc.s; +} +static APR_INLINE int cs_fd(event_conn_state_t *cs) +{ + apr_os_sock_t fd = -1; + apr_os_sock_get(&fd, cs_sd(cs)); + return fd; +} +static APR_INLINE apr_sockaddr_t *cs_raddr(event_conn_state_t *cs) +{ + apr_sockaddr_t *addr = NULL; + apr_socket_addr_get(&addr, APR_REMOTE, cs_sd(cs)); + return addr; +} +static APR_INLINE const char *cs_state_str(event_conn_state_t *cs) +{ + switch (cs->pub.state) { + case CONN_STATE_PROCESSING: + return "STATE_PROCESSING"; + case CONN_STATE_HANDLER: + return "STATE_HANDLER"; + case CONN_STATE_ASYNC_WAITIO: + return "STATE_ASYNC_WAITIO"; + case CONN_STATE_WRITE_COMPLETION: + return "STATE_WRITE_COMPLETION"; + case CONN_STATE_KEEPALIVE: + return "STATE_KEEPALIVE"; + case CONN_STATE_LINGER: + case CONN_STATE_LINGER_NORMAL: + case CONN_STATE_LINGER_SHORT: + return "STATE_LINGER"; + case CONN_STATE_SUSPENDED: + return "STATE_SUSPENDED"; + default: + return "STATE_UNKNOWN"; + } +} +#define CS_FMT "pp:%s:%i" +#define CS_ARG(cs) (cs), cs_state_str(cs), cs_fd(cs) +#define CS_FMT_TO CS_FMT " to [%pI]" +#define CS_ARG_TO(cs) CS_ARG(cs), cs_raddr(cs) +/* + * The chain of connections to be shutdown by a worker thread (deferred), + * linked list updated atomically. + */ +static event_conn_state_t *volatile defer_linger_chain; + +APR_RING_HEAD(timeout_head_t, event_conn_state_t); struct timeout_queue { struct timeout_head_t head; apr_interval_time_t timeout; apr_uint32_t count; /* for this queue */ apr_uint32_t *total; /* for all chained/related queues */ + const char *name; /* for logging */ struct timeout_queue *next; /* chaining */ }; + /* * Several timeout queues that use different timeouts, so that we always can * simply append to the end. 
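Because every entry of a queue shares the queue's single timeout and
TO_QUEUE_APPEND() (below) always inserts at the tail, each queue stays
ordered by expiry, so a sweep can stop at the first entry that has not
expired yet. A minimal sketch of such a sweep (hypothetical helper, not part
of the patch; the listener's real maintenance loop also handles wakeups and
batches the shutdowns), assuming timeout_mutex is held:

    static void sweep_queue(struct timeout_queue *q, apr_time_t now)
    {
        event_conn_state_t *cs = APR_RING_FIRST(&q->head);
        while (cs != APR_RING_SENTINEL(&q->head, event_conn_state_t,
                                       timeout_list)
               && cs->queue_timestamp + q->timeout <= now) {
            event_conn_state_t *next = APR_RING_NEXT(cs, timeout_list);
            TO_QUEUE_REMOVE(q, cs);  /* unlink from this queue */
            /* ... then shut the expired connection down (lingering close) */
            cs = next;
        }
    }
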
@@ -279,39 +355,38 @@ struct timeout_queue {
  * linger_q uses MAX_SECS_TO_LINGER
  * short_linger_q uses SECONDS_TO_LINGER
  */
-static struct timeout_queue *waitio_q,
-                            *write_completion_q,
-                            *keepalive_q,
-                            *linger_q,
-                            *short_linger_q;
-static volatile apr_time_t queues_next_expiry;
+static struct timeout_queue *waitio_q,           /* wait for I/O to happen */
+                            *write_completion_q, /* completion or user async poll */
+                            *keepalive_q,        /* in between requests */
+                            *linger_q,           /* lingering (read) before close */
+                            *short_linger_q;     /* lingering (read) before close (short timeout) */
 
-/* Prevent extra poll/wakeup calls for timeouts close in the future (queues
- * have the granularity of a second anyway).
- * XXX: Wouldn't 0.5s (instead of 0.1s) be "enough"?
- */
-#define TIMEOUT_FUDGE_FACTOR apr_time_from_msec(100)
+static volatile apr_time_t queues_next_expiry; /* next expiry time across all queues */
 
 /*
  * Macros for accessing struct timeout_queue.
  * For TO_QUEUE_APPEND and TO_QUEUE_REMOVE, timeout_mutex must be held.
  */
-static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
+static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *cs)
 {
     apr_time_t elem_expiry;
     apr_time_t next_expiry;
 
-    APR_RING_INSERT_TAIL(&q->head, el, event_conn_state_t, timeout_list);
+    ap_assert(q && !cs->q);
+
+    cs->q = q;
+    cs->queue_timestamp = apr_time_now();
+    APR_RING_INSERT_TAIL(&q->head, cs, event_conn_state_t, timeout_list);
     ++*q->total;
     ++q->count;
 
     /* Cheaply update the global queues_next_expiry with the one of the
      * first entry of this queue (oldest) if it expires before.
      */
-    el = APR_RING_FIRST(&q->head);
-    elem_expiry = el->queue_timestamp + q->timeout;
+    cs = APR_RING_FIRST(&q->head);
+    elem_expiry = cs->queue_timestamp + q->timeout;
     next_expiry = queues_next_expiry;
-    if (!next_expiry || next_expiry > elem_expiry + TIMEOUT_FUDGE_FACTOR) {
+    if (!next_expiry || next_expiry > elem_expiry + QUEUES_FUDGE_TIMEOUT) {
         queues_next_expiry = elem_expiry;
         /* Unblock the poll()ing listener for it to update its timeout. */
         if (listener_is_wakeable) {
@@ -320,29 +395,51 @@ static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
     }
 }
 
-static void TO_QUEUE_REMOVE(struct timeout_queue *q, event_conn_state_t *el)
+static void TO_QUEUE_REMOVE(struct timeout_queue *q, event_conn_state_t *cs)
 {
-    APR_RING_REMOVE(el, timeout_list);
-    APR_RING_ELEM_INIT(el, timeout_list);
+    ap_assert(q && cs->q == q);
+    cs->q = NULL;
+
+    APR_RING_REMOVE(cs, timeout_list);
+    APR_RING_ELEM_INIT(cs, timeout_list);
     --*q->total;
     --q->count;
 }
 
-static struct timeout_queue *TO_QUEUE_MAKE(apr_pool_t *p, apr_time_t t,
+static struct timeout_queue *TO_QUEUE_MAKE(apr_pool_t *p,
+                                           const char *name,
+                                           apr_interval_time_t t,
                                            struct timeout_queue *ref)
 {
     struct timeout_queue *q;
-
+
     q = apr_pcalloc(p, sizeof *q);
     APR_RING_INIT(&q->head, event_conn_state_t, timeout_list);
     q->total = (ref) ?
ref->total : apr_pcalloc(p, sizeof *q->total); q->timeout = t; + q->name = name; return q; } -#define TO_QUEUE_ELEM_INIT(el) \ - APR_RING_ELEM_INIT((el), timeout_list) +static struct timeout_queue *TO_QUEUE_CHAIN(apr_pool_t *p, + const char *name, + apr_interval_time_t t, + struct timeout_queue **ref, + apr_hash_t *ht, apr_pool_t *hp) +{ + struct timeout_queue *q = apr_hash_get(ht, &t, sizeof t); + + if (!q) { + q = TO_QUEUE_MAKE(p, name, t, *ref); + q->next = *ref; + *ref = q; + + apr_hash_set(ht, apr_pmemdup(hp, &t, sizeof t), sizeof t, q); + } + + return q; +} #if HAVE_SERF typedef struct { @@ -454,6 +551,7 @@ static event_retained_data *retained; #endif struct event_srv_cfg_s { + /* Per server timeout queues */ struct timeout_queue *io_q, *wc_q, *ka_q; @@ -512,37 +610,59 @@ static void disable_listensocks(void) if (apr_atomic_cas32(&listensocks_disabled, 1, 0) != 0) { return; } - if (event_pollset) { - for (i = 0; i < num_listensocks; i++) { - apr_pollset_remove(event_pollset, &listener_pollfd[i]); - } - } + + ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) + "Suspend listening sockets: idlers:%i conns:%u " + "waitio:%u write:%u keepalive:%u linger:%u/%u " + "suspended:%u clogged:%u", + ap_queue_info_num_idlers(worker_queue_info), + apr_atomic_read32(&connection_count), + apr_atomic_read32(waitio_q->total), + apr_atomic_read32(write_completion_q->total), + apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(linger_q->total), + apr_atomic_read32(short_linger_q->total), + apr_atomic_read32(&suspended_count), + apr_atomic_read32(&clogged_count)); + ap_scoreboard_image->parent[ap_child_slot].not_accepting = 1; + + for (i = 0; i < num_listensocks; i++) { + apr_pollset_remove(event_pollset, &listener_pollfd[i]); + } } static void enable_listensocks(void) { int i; if (listener_may_exit - || apr_atomic_cas32(&listensocks_disabled, 0, 1) != 1) { + || apr_atomic_cas32(&listensocks_disabled, 0, 1) != 1) { return; } - ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(00457) - "Accepting new connections again: " - "%u active conns (%u lingering/%u clogged/%u suspended), " - "%u idle workers", + + ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) + "Resume listening sockets: idlers:%i conns:%u " + "waitio:%u write:%u keepalive:%u linger:%u/%u " + "suspended:%u clogged:%u", + ap_queue_info_num_idlers(worker_queue_info), apr_atomic_read32(&connection_count), - apr_atomic_read32(&lingering_count), - apr_atomic_read32(&clogged_count), + apr_atomic_read32(waitio_q->total), + apr_atomic_read32(write_completion_q->total), + apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(linger_q->total), + apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&suspended_count), - ap_queue_info_num_idlers(worker_queue_info)); - for (i = 0; i < num_listensocks; i++) - apr_pollset_add(event_pollset, &listener_pollfd[i]); + apr_atomic_read32(&clogged_count)); + /* * XXX: This is not yet optimal. If many workers suddenly become available, * XXX: the parent may kill some processes off too soon. 
*/ ap_scoreboard_image->parent[ap_child_slot].not_accepting = 0; + + for (i = 0; i < num_listensocks; i++) { + apr_pollset_add(event_pollset, &listener_pollfd[i]); + } } static APR_INLINE apr_uint32_t listeners_disabled(void) @@ -575,21 +695,23 @@ static APR_INLINE int should_enable_listensocks(void) return !dying && listeners_disabled() && !connections_above_limit(NULL); } -static void close_socket_nonblocking_(apr_socket_t *csd, - const char *from, int line) +static void close_socket_at(apr_socket_t *csd, + const char *at, int line) { - apr_status_t rv; apr_os_sock_t fd = -1; + apr_status_t rv = apr_os_sock_get(&fd, csd); /* close_worker_sockets() may have closed it already */ - rv = apr_os_sock_get(&fd, csd); - ap_log_error(APLOG_MARK, APLOG_TRACE8, 0, ap_server_conf, - "closing socket %i/%pp from %s:%i", (int)fd, csd, from, line); if (rv == APR_SUCCESS && fd == -1) { + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "dead socket %pp at %s:%i", csd, at, line); return; } - apr_socket_timeout_set(csd, 0); + ap_log_error(APLOG_MARK, APLOG_TRACE7, rv, ap_server_conf, + "closing socket %pp:%i at %s:%i", csd, (int)fd, at, line); + + apr_socket_opt_set(csd, APR_SO_NONBLOCK, 1); rv = apr_socket_close(csd); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(00468) @@ -597,8 +719,8 @@ static void close_socket_nonblocking_(apr_socket_t *csd, AP_DEBUG_ASSERT(0); } } -#define close_socket_nonblocking(csd) \ - close_socket_nonblocking_(csd, __FUNCTION__, __LINE__) +#define close_socket(csd) \ + close_socket_at(csd, __FUNCTION__, __LINE__) static void close_worker_sockets(void) { @@ -607,15 +729,16 @@ static void close_worker_sockets(void) apr_socket_t *csd = worker_sockets[i]; if (csd) { worker_sockets[i] = NULL; - close_socket_nonblocking(csd); + close_socket(csd); } } } -static void wakeup_listener(void) +static void shutdown_listener(void) { ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - "wake up listener%s", listener_may_exit ? " again" : ""); + "shutting down listener%s", + listener_may_exit ? 
" again" : ""); listener_may_exit = 1; disable_listensocks(); @@ -667,7 +790,7 @@ static void signal_threads(int mode) /* in case we weren't called from the listener thread, wake up the * listener thread */ - wakeup_listener(); + shutdown_listener(); /* for ungraceful termination, let the workers exit now; * for graceful termination, the listener thread will notify the @@ -841,8 +964,10 @@ static apr_status_t decrement_connection_count(void *cs_) { int is_last_connection; event_conn_state_t *cs = cs_; - ap_log_cerror(APLOG_MARK, APLOG_TRACE8, 0, cs->c, - "cleanup connection from state %i", (int)cs->pub.state); + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, + "connection %" CS_FMT_TO " cleaned up", + CS_ARG_TO(cs)); + switch (cs->pub.state) { case CONN_STATE_LINGER: case CONN_STATE_LINGER_NORMAL: @@ -861,8 +986,8 @@ static apr_status_t decrement_connection_count(void *cs_) */ is_last_connection = !apr_atomic_dec32(&connection_count); if (listener_is_wakeable - && ((is_last_connection && listener_may_exit) - || should_enable_listensocks())) { + && ((is_last_connection && listener_may_exit) + || should_enable_listensocks())) { apr_pollset_wakeup(event_pollset); } if (dying) { @@ -895,7 +1020,7 @@ static void notify_resume(event_conn_state_t *cs, int cleanup) static int defer_lingering_close(event_conn_state_t *cs) { ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "deferring close from state %i", (int)cs->pub.state); + "deferring close for connection %" CS_FMT, CS_ARG(cs)); /* The connection is not shutdown() yet strictly speaking, but it's not * in any queue nor handled by a worker either (will be very soon), so @@ -922,14 +1047,28 @@ static int defer_lingering_close(event_conn_state_t *cs) * Pre-condition: nonblocking, can be called from anywhere provided cs is not * in any timeout queue or in the pollset. */ -static void close_connection(event_conn_state_t *cs) +static void close_connection_at(event_conn_state_t *cs, + const char *at, int line) { - ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "closing connection from state %i", (int)cs->pub.state); + if (cs->c) { + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "closing connection %" CS_FMT " at %s:%i", + CS_ARG(cs), at, line); + } + else { + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, + "closing connection %" CS_FMT_TO " at %s:%i", + CS_ARG_TO(cs), at, line); + } - close_socket_nonblocking(cs->pfd.desc.s); + close_socket_at(cs_sd(cs), at, line); ap_queue_info_push_pool(worker_queue_info, cs->p); } +#define close_connection(cs) \ + close_connection_at((cs), __FUNCTION__, __LINE__) + +/* forward declare */ +static void set_conn_state_sense(event_conn_state_t *cs, int sense); /* Shutdown the connection in case of timeout, error or resources shortage. * This starts short lingering close if not already there, or directly closes @@ -1015,11 +1154,145 @@ static int event_post_read_request(request_rec *r) return OK; } +static int pollset_add_at(event_conn_state_t *cs, int sense, + struct timeout_queue *q, + const char *at, int line) +{ + apr_status_t rv; + + ap_log_cerror(APLOG_MARK, APLOG_TRACE7, 0, cs->c, + "pollset: add %s=%" APR_TIME_T_FMT " events=%x" + " for connection %" CS_FMT " at %s:%i", + (q) ? "q" : "t", + (q) ? 
q->timeout : -1, + (int)cs->pfd.reqevents, + CS_ARG(cs), at, line); + + ap_assert(cs->q == NULL && q != NULL); + + set_conn_state_sense(cs, sense); + + if (q) { + apr_thread_mutex_lock(timeout_mutex); + TO_QUEUE_APPEND(q, cs); + } + rv = apr_pollset_add(event_pollset, &cs->pfd); + if (rv != APR_SUCCESS) { + if (q) { + TO_QUEUE_REMOVE(q, cs); + apr_thread_mutex_unlock(timeout_mutex); + } + + /* close_worker_sockets() may have closed it already */ + if (workers_may_exit) { + AP_DEBUG_ASSERT(APR_STATUS_IS_EBADF(rv)); + } + else { + ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, cs->c, APLOGNO(03093) + "pollset add failed for connection %" CS_FMT " at %s:%i", + CS_ARG(cs), at, line); + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + } + return 0; + } + if (q) { + apr_thread_mutex_unlock(timeout_mutex); + } + return 1; +} +#define pollset_add(cs, sense, q) \ + pollset_add_at((cs), (sense), (q), __FUNCTION__, __LINE__) + +static int pollset_del_at(event_conn_state_t *cs, int locked, + const char *at, int line) +{ + apr_status_t rv; + + ap_log_cerror(APLOG_MARK, APLOG_TRACE7, 0, cs->c, + "pollset: del %s=%" APR_TIME_T_FMT " events=%x" + " for connection %" CS_FMT " at %s:%i", + (cs->q) ? "q" : "t", + (cs->q) ? cs->q->timeout : -1, + (int)cs->pfd.reqevents, + CS_ARG(cs), at, line); + + ap_assert(cs->q != NULL); + + if (cs->q) { + if (!locked) { + apr_thread_mutex_lock(timeout_mutex); + } + TO_QUEUE_REMOVE(cs->q, cs); + if (!locked) { + apr_thread_mutex_unlock(timeout_mutex); + } + } + + /* + * Some of the pollset backends, like KQueue or Epoll + * automagically remove the FD if the socket is closed, + * therefore, we can accept _SUCCESS or _NOTFOUND, + * and we still want to keep going + */ + rv = apr_pollset_remove(event_pollset, &cs->pfd); + if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) { + ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, cs->c, APLOGNO(03094) + "pollset remove failed for connection %" CS_FMT " at %s:%i", + CS_ARG(cs), at, line); + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + return 0; + } + + return 1; +} +#define pollset_del(cs, locked) \ + pollset_del_at((cs), (locked), __FUNCTION__, __LINE__) + /* Forward declare */ static void process_lingering_close(event_conn_state_t *cs); -static void update_reqevents_from_sense(event_conn_state_t *cs, - int default_sense) +static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd) +{ + event_conn_state_t *cs = apr_pcalloc(p, sizeof(*cs)); + listener_poll_type *pt; + + cs->p = p; + cs->pfd.desc.s = csd; + cs->pfd.desc_type = APR_POLL_SOCKET; + cs->pfd.client_data = pt = apr_pcalloc(p, sizeof(*pt)); + pt->type = PT_CSD; + pt->baton = cs; + + APR_RING_ELEM_INIT(cs, timeout_list); + + cs->sc = ap_get_module_config(ap_server_conf->module_config, + &mpm_event_module); + + /** + * XXX If the platform does not have a usable way of bundling + * accept() with a socket readability check, like Win32, + * and there are measurable delays before the + * socket is readable due to the first data packet arriving, + * it might be better to create the cs on the listener thread + * with the state set to CONN_STATE_KEEPALIVE + * + * FreeBSD users will want to enable the HTTP accept filter + * module in their kernel for the highest performance + * When the accept filter is active, sockets are kept in the + * kernel until a HTTP request is received. 
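+     * (On FreeBSD this is the accf_http(9) module; httpd can request it
+     * via the AcceptFilter directive.)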
+ */ + cs->pub.state = CONN_STATE_PROCESSING; + cs->pub.sense = CONN_SENSE_DEFAULT; + + apr_atomic_inc32(&connection_count); + apr_pool_cleanup_register(p, cs, decrement_connection_count, + apr_pool_cleanup_null); + return cs; +} + +static void set_conn_state_sense(event_conn_state_t *cs, int default_sense) { int sense = default_sense; @@ -1046,80 +1319,51 @@ static void update_reqevents_from_sense(event_conn_state_t *cs, /* * process one connection in the worker */ -static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * sock, - event_conn_state_t * cs, int my_child_num, - int my_thread_num) +static void process_socket(apr_thread_t *thd, apr_pool_t *p, + apr_socket_t *sock, event_conn_state_t *cs, + int my_child_num, int my_thread_num) { - conn_rec *c; + conn_rec *c = cs->c; long conn_id = ID_FROM_CHILD_THREAD(my_child_num, my_thread_num); - int clogging = 0, from_wc_q = 0; - apr_status_t rv; - int rc = OK; + int rc = OK, processed = 0, clogging; - if (cs == NULL) { /* This is a new connection */ - listener_poll_type *pt = apr_pcalloc(p, sizeof(*pt)); - cs = apr_pcalloc(p, sizeof(event_conn_state_t)); + if (!c) { /* This is a new connection */ cs->bucket_alloc = apr_bucket_alloc_create(p); ap_create_sb_handle(&cs->sbh, p, my_child_num, my_thread_num); - c = ap_run_create_connection(p, ap_server_conf, sock, - conn_id, cs->sbh, cs->bucket_alloc); + cs->c = c = ap_run_create_connection(p, ap_server_conf, sock, conn_id, + cs->sbh, cs->bucket_alloc); if (!c) { ap_queue_info_push_pool(worker_queue_info, p); return; } - apr_atomic_inc32(&connection_count); - apr_pool_cleanup_register(c->pool, cs, decrement_connection_count, - apr_pool_cleanup_null); + apr_pool_pre_cleanup_register(p, cs, ptrans_pre_cleanup); ap_set_module_config(c->conn_config, &mpm_event_module, cs); c->current_thread = thd; c->cs = &cs->pub; - cs->c = c; - cs->p = p; - cs->sc = ap_get_module_config(ap_server_conf->module_config, - &mpm_event_module); - cs->pfd.desc_type = APR_POLL_SOCKET; - cs->pfd.desc.s = sock; - pt->type = PT_CSD; - pt->baton = cs; - cs->pfd.client_data = pt; - apr_pool_pre_cleanup_register(p, cs, ptrans_pre_cleanup); - TO_QUEUE_ELEM_INIT(cs); ap_update_vhost_given_ip(c); - rc = ap_pre_connection(c, sock); if (rc != OK && rc != DONE) { ap_log_cerror(APLOG_MARK, APLOG_DEBUG, 0, c, APLOGNO(00469) - "process_socket: connection aborted"); + "process_socket: connection aborted (%d)", rc); close_connection(cs); return; } - /** - * XXX If the platform does not have a usable way of bundling - * accept() with a socket readability check, like Win32, - * and there are measurable delays before the - * socket is readable due to the first data packet arriving, - * it might be better to create the cs on the listener thread - * with the state set to CONN_STATE_KEEPALIVE - * - * FreeBSD users will want to enable the HTTP accept filter - * module in their kernel for the highest performance - * When the accept filter is active, sockets are kept in the - * kernel until a HTTP request is received. 
- */ - cs->pub.state = CONN_STATE_PROCESSING; cs->pub.sense = CONN_SENSE_DEFAULT; } - else { + else { /* The connection is scheduled back */ c = cs->c; + c->current_thread = thd; + c->id = conn_id; /* thread number is part of ID */ ap_update_sb_handle(cs->sbh, my_child_num, my_thread_num); notify_resume(cs, 0); - c->current_thread = thd; - /* Subsequent request on a conn, and thread number is part of ID */ - c->id = conn_id; } + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "processing connection %" CS_FMT " (aborted %d, clogging %d)", + CS_ARG(cs), c->aborted, c->clogging_input_filters); + if (CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { goto lingering_close; } @@ -1133,8 +1377,8 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc */ || c->clogging_input_filters) { process_connection: + processed = 1; cs->pub.state = CONN_STATE_PROCESSING; - clogging = c->clogging_input_filters; if (clogging) { apr_atomic_inc32(&clogged_count); @@ -1197,40 +1441,24 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc goto lingering_close; } } - else if (cs->pub.state == CONN_STATE_WRITE_COMPLETION) { - from_wc_q = 1; - } if (cs->pub.state == CONN_STATE_ASYNC_WAITIO) { /* Set a read/write timeout for this connection, and let the * event thread poll for read/writeability. */ - cs->queue_timestamp = apr_time_now(); - notify_suspend(cs); - ap_update_child_status(cs->sbh, SERVER_BUSY_READ, NULL); + notify_suspend(cs); /* Modules might set c->cs->sense to CONN_SENSE_WANT_WRITE, * the default is CONN_SENSE_WANT_READ still. */ - update_reqevents_from_sense(cs, CONN_SENSE_WANT_READ); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->io_q, cs); - rv = apr_pollset_add(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { - AP_DEBUG_ASSERT(0); - TO_QUEUE_REMOVE(cs->sc->io_q, cs); - apr_thread_mutex_unlock(timeout_mutex); - ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(10503) - "process_socket: apr_pollset_add failure in " - "CONN_STATE_ASYNC_WAITIO"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - } - else { - apr_thread_mutex_unlock(timeout_mutex); + if (pollset_add(cs, CONN_SENSE_WANT_READ, cs->sc->io_q)) { + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + cs->pub.state = CONN_STATE_LINGER; + goto lingering_close; } - return; + + return; /* queued */ } if (cs->pub.state == CONN_STATE_WRITE_COMPLETION) { @@ -1239,11 +1467,9 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc /* Flush all pending outputs before going to CONN_STATE_KEEPALIVE or * straight to CONN_STATE_PROCESSING if inputs are pending already. */ - ap_update_child_status(cs->sbh, SERVER_BUSY_WRITE, NULL); - if (from_wc_q) { - from_wc_q = 0; /* one shot */ + if (!processed) { pending = ap_check_output_pending(c); } else if (ap_filter_should_yield(c->output_filters)) { @@ -1251,38 +1477,24 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc } if (pending == AGAIN) { /* Let the event thread poll for write */ - cs->queue_timestamp = apr_time_now(); notify_suspend(cs); - - /* Add work to pollset. 
*/ cs->pub.sense = CONN_SENSE_DEFAULT; - update_reqevents_from_sense(cs, CONN_SENSE_WANT_WRITE); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->wc_q, cs); - rv = apr_pollset_add(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { - AP_DEBUG_ASSERT(0); - TO_QUEUE_REMOVE(cs->sc->wc_q, cs); - apr_thread_mutex_unlock(timeout_mutex); - ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03465) - "process_socket: apr_pollset_add failure in " - "CONN_STATE_WRITE_COMPLETION"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - } - else { - apr_thread_mutex_unlock(timeout_mutex); + if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q)) { + return; /* queued */ } - return; + /* Fall through lingering close */ + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); } - if (pending != OK || c->aborted || c->keepalive != AP_CONN_KEEPALIVE) { - cs->pub.state = CONN_STATE_LINGER; - goto lingering_close; - } - if (ap_check_input_pending(c) == AGAIN) { - goto process_connection; + else if (pending == OK) { + /* Some data to process immediately? */ + pending = (c->keepalive == AP_CONN_KEEPALIVE + ? ap_check_input_pending(c) + : DONE); + if (pending == AGAIN) { + goto process_connection; + } } - if (listener_may_exit) { + if (pending != OK || listener_may_exit) { cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } @@ -1302,40 +1514,25 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc * timeout today. With a normal client, the socket will be readable in * a few milliseconds anyway. */ - cs->queue_timestamp = apr_time_now(); notify_suspend(cs); - /* Add work to pollset. */ - cs->pub.sense = CONN_SENSE_DEFAULT; - update_reqevents_from_sense(cs, CONN_SENSE_WANT_READ); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->ka_q, cs); - rv = apr_pollset_add(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { - AP_DEBUG_ASSERT(0); - TO_QUEUE_REMOVE(cs->sc->ka_q, cs); - apr_thread_mutex_unlock(timeout_mutex); - ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03093) - "process_socket: apr_pollset_add failure for " - "keep alive"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - } - else { - apr_thread_mutex_unlock(timeout_mutex); + if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->sc->ka_q)) { + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + cs->pub.state = CONN_STATE_LINGER; + goto lingering_close; } - return; + + return; /* queued */ } if (cs->pub.state == CONN_STATE_SUSPENDED) { cs->c->suspended_baton = cs; apr_atomic_inc32(&suspended_count); notify_suspend(cs); - return; + return; /* done */ } lingering_close: - /* CONN_STATE_LINGER[_*] fall through process_lingering_close() */ process_lingering_close(cs); } @@ -1347,31 +1544,29 @@ static apr_status_t event_resume_suspended (conn_rec *c) ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO(02615) "event_resume_suspended: suspended_baton is NULL"); return APR_EGENERAL; - } else if (!cs->suspended) { + } + if (!cs->suspended) { ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO(02616) "event_resume_suspended: Thread isn't suspended"); return APR_EGENERAL; } + apr_atomic_dec32(&suspended_count); c->suspended_baton = NULL; + cs->pub.sense = CONN_SENSE_DEFAULT; if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { - cs->queue_timestamp = apr_time_now(); - notify_suspend(cs); - - cs->pub.sense = CONN_SENSE_DEFAULT; cs->pub.state = CONN_STATE_WRITE_COMPLETION; - 
update_reqevents_from_sense(cs, CONN_SENSE_WANT_WRITE); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->wc_q, cs); - apr_pollset_add(event_pollset, &cs->pfd); - apr_thread_mutex_unlock(timeout_mutex); - } - else { - process_lingering_close(cs); - } + if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q)) { + return APR_SUCCESS; /* queued */ + } - return OK; + /* fall through lingering close on error */ + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + } + cs->pub.state = CONN_STATE_LINGER; + process_lingering_close(cs); + return APR_SUCCESS; } /* conns_this_child has gone to zero or below. See if the admin coded @@ -1388,36 +1583,31 @@ static void check_infinite_requests(void) conns_this_child = APR_INT32_MAX; } -static int close_listeners(int *closed) +static void set_child_dying(void) { - if (!*closed) { - int i; + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, "quiescing"); - ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - "closing listeners (connection_count=%u)", - apr_atomic_read32(&connection_count)); - ap_close_listeners_ex(my_bucket->listeners); + dying = 1; + ap_scoreboard_image->parent[ap_child_slot].quiescing = 1; + ap_close_listeners_ex(my_bucket->listeners); - dying = 1; - ap_scoreboard_image->parent[ap_child_slot].quiescing = 1; +#if 0 + { + int i; for (i = 0; i < threads_per_child; ++i) { ap_update_child_status_from_indexes(ap_child_slot, i, SERVER_GRACEFUL, NULL); } - /* wake up the main thread */ - kill(ap_my_pid, SIGTERM); - - ap_queue_info_free_idle_pools(worker_queue_info); - ap_queue_interrupt_all(worker_queue); - - *closed = 1; /* once */ - return 1; } +#endif - ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, - "closed listeners (connection_count=%u)", - apr_atomic_read32(&connection_count)); - return 0; + /* wake up idle worker threads */ + ap_queue_interrupt_all(worker_queue); + /* wake up the main thread */ + kill(ap_my_pid, SIGTERM); + + /* No new connections will use the idle pools */ + ap_queue_info_free_idle_pools(worker_queue_info); } static void unblock_signal(int sig) @@ -1500,9 +1690,10 @@ static apr_status_t push2worker(event_conn_state_t *cs, apr_socket_t *csd, apr_status_t rc; if (cs) { - csd = cs->pfd.desc.s; ptrans = cs->p; + csd = cs_sd(cs); } + rc = ap_queue_push_socket(worker_queue, csd, cs, ptrans); if (rc != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_CRIT, rc, ap_server_conf, APLOGNO(00471) @@ -1515,7 +1706,7 @@ static apr_status_t push2worker(event_conn_state_t *cs, apr_socket_t *csd, } else { if (csd) { - close_socket_nonblocking(csd); + close_socket(csd); } if (ptrans) { ap_queue_info_push_pool(worker_queue_info, ptrans); @@ -1572,43 +1763,30 @@ static timer_event_t timer_free_ring; static apr_skiplist *timer_skiplist; static volatile apr_time_t timers_next_expiry; -/* Same goal as for TIMEOUT_FUDGE_FACTOR (avoid extra poll calls), but applied - * to timers. Since their timeouts are custom (user defined), we can't be too - * approximative here (hence using 0.01s). - */ -#define EVENT_FUDGE_FACTOR apr_time_from_msec(10) - -/* The following compare function is used by apr_skiplist_insert() to keep the - * elements (timers) sorted and provide O(log n) complexity (this is also true - * for apr_skiplist_{find,remove}(), but those are not used in MPM event where - * inserted timers are not searched nor removed, but with apr_skiplist_pop() - * which does use any compare function). It is meant to return 0 when a == b, - * <0 when a < b, and >0 when a > b. 
However apr_skiplist_insert() will not - * add duplicates (i.e. a == b), and apr_skiplist_add() is only available in - * APR 1.6, yet multiple timers could possibly be created in the same micro- - * second (duplicates with regard to apr_time_t); therefore we implement the - * compare function to return +1 instead of 0 when compared timers are equal, - * thus duplicates are still added after each other (in order of insertion). +/* The timer_comp() function is used by apr_skiplist_insert() to keep the + * elements/timers sorted, but it should never return 0 because inserting + * duplicates is not possible (apr_skiplist_add() would allow this but it's + * not available before APR 1.6). Thus duplicates are sorted by order of + * insertion and timers are never equal for the skiplist (not an issue + * because MPM event does not use apr_skiplist_{find,remove}() but + * apr_skiplist_pop() only). */ static int timer_comp(void *a, void *b) { - apr_time_t t1 = (apr_time_t) ((timer_event_t *)a)->when; - apr_time_t t2 = (apr_time_t) ((timer_event_t *)b)->when; - AP_DEBUG_ASSERT(t1); - AP_DEBUG_ASSERT(t2); - return ((t1 < t2) ? -1 : 1); + const timer_event_t *ta = a, *tb = b; + return (ta->when < tb->when) ? -1 : 1; } static apr_thread_mutex_t *g_timer_skiplist_mtx; -static timer_event_t * event_get_timer_event(apr_time_t t, - ap_mpm_callback_fn_t *cbfn, - void *baton, - int insert, - apr_array_header_t *pfds) +static timer_event_t *get_timer_event(apr_time_t timeout, + ap_mpm_callback_fn_t *cbfn, + void *baton, + int insert, + apr_array_header_t *pfds) { timer_event_t *te; - apr_time_t now = (t < 0) ? 0 : apr_time_now(); + apr_time_t now = (timeout < 0) ? 0 : apr_time_now(); /* oh yeah, and make locking smarter/fine grained. */ @@ -1620,16 +1798,16 @@ static timer_event_t * event_get_timer_event(apr_time_t t, } else { te = apr_skiplist_alloc(timer_skiplist, sizeof(timer_event_t)); - APR_RING_ELEM_INIT(te, link); + memset(te, 0, sizeof(*te)); } + APR_RING_ELEM_INIT(te, link); te->cbfunc = cbfn; te->baton = baton; - te->canceled = 0; - te->when = now + t; + te->when = now + timeout; te->pfds = pfds; - if (insert) { + if (insert) { apr_time_t next_expiry; /* Okay, add sorted by when.. */ @@ -1639,33 +1817,51 @@ static timer_event_t * event_get_timer_event(apr_time_t t, * if it expires before. */ next_expiry = timers_next_expiry; - if (!next_expiry || next_expiry > te->when + EVENT_FUDGE_FACTOR) { + if (!next_expiry || next_expiry > te->when + TIMERS_FUDGE_TIMEOUT) { timers_next_expiry = te->when; - /* Unblock the poll()ing listener for it to update its timeout. */ + /* Wake up the listener to eventually update its poll()ing timeout. 
*/ if (listener_is_wakeable) { apr_pollset_wakeup(event_pollset); } } } + apr_thread_mutex_unlock(g_timer_skiplist_mtx); return te; } -static apr_status_t event_register_timed_callback_ex(apr_time_t t, +static void put_timer_event(timer_event_t *te, int locked) +{ + if (!locked) { + apr_thread_mutex_lock(g_timer_skiplist_mtx); + } + + memset(te, 0, sizeof(*te)); + APR_RING_INSERT_TAIL(&timer_free_ring.link, te, timer_event_t, link); + + if (!locked) { + apr_thread_mutex_unlock(g_timer_skiplist_mtx); + } +} + +static apr_status_t event_register_timed_callback_ex(apr_time_t timeout, ap_mpm_callback_fn_t *cbfn, - void *baton, + void *baton, apr_array_header_t *pfds) { - event_get_timer_event(t, cbfn, baton, 1, pfds); + if (!cbfn) { + return APR_EINVAL; + } + get_timer_event(timeout, cbfn, baton, 1, pfds); return APR_SUCCESS; } -static apr_status_t event_register_timed_callback(apr_time_t t, +static apr_status_t event_register_timed_callback(apr_time_t timeout, ap_mpm_callback_fn_t *cbfn, void *baton) { - event_register_timed_callback_ex(t, cbfn, baton, NULL); + event_register_timed_callback_ex(timeout, cbfn, baton, NULL); return APR_SUCCESS; } @@ -1687,6 +1883,10 @@ static apr_status_t event_cleanup_poll_callback(void *data) } } + if (final_rc) { + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + } return final_rc; } @@ -1697,18 +1897,24 @@ static apr_status_t event_register_poll_callback_ex(apr_pool_t *p, void *baton, apr_time_t timeout) { - socket_callback_baton_t *scb = apr_pcalloc(p, sizeof(*scb)); - listener_poll_type *pt = apr_palloc(p, sizeof(*pt)); + listener_poll_type *pt; + socket_callback_baton_t *scb; apr_status_t rc, final_rc = APR_SUCCESS; int i; - pt->type = PT_USER; - pt->baton = scb; + if (!cbfn || !tofn) { + return APR_EINVAL; + } + scb = apr_pcalloc(p, sizeof(*scb)); scb->cbfunc = cbfn; scb->user_baton = baton; scb->pfds = apr_array_copy(p, pfds); + pt = apr_palloc(p, sizeof(*pt)); + pt->type = PT_USER; + pt->baton = scb; + apr_pool_pre_cleanup_register(p, scb->pfds, event_cleanup_poll_callback); for (i = 0; i < scb->pfds->nelts; i++) { @@ -1725,9 +1931,12 @@ static apr_status_t event_register_poll_callback_ex(apr_pool_t *p, } } - if (timeout > 0) { - /* XXX: This cancel timer event can fire before the pollset is updated */ - scb->cancel_event = event_get_timer_event(timeout, tofn, baton, 1, scb->pfds); + if (timeout > 0) { + /* Prevent the timer from firing before the pollset is updated */ + if (timeout < TIMERS_FUDGE_TIMEOUT) { + timeout = TIMERS_FUDGE_TIMEOUT; + } + scb->cancel_event = get_timer_event(timeout, tofn, baton, 1, scb->pfds); } for (i = 0; i < scb->pfds->nelts; i++) { apr_pollfd_t *pfd = (apr_pollfd_t *)scb->pfds->elts + i; @@ -1766,14 +1975,13 @@ static apr_status_t event_register_poll_callback(apr_pool_t *p, #define LINGERING_BUF_SIZE (32 * 1024) static void process_lingering_close(event_conn_state_t *cs) { - apr_socket_t *csd = ap_get_conn_socket(cs->c); char dummybuf[LINGERING_BUF_SIZE]; - apr_size_t nbytes; + apr_socket_t *csd = cs_sd(cs); apr_status_t rv; - struct timeout_queue *q; ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "lingering close from state %i", (int)cs->pub.state); + "lingering close for connection %" CS_FMT, + CS_ARG(cs)); AP_DEBUG_ASSERT(CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)); if (!cs->linger_started) { @@ -1791,7 +1999,9 @@ static void process_lingering_close(event_conn_state_t *cs) close_connection(cs); return; } - + + notify_suspend(cs); + /* All nonblocking from now, no need for APR_INCOMPLETE_READ either */ 
apr_socket_timeout_set(csd, 0); apr_socket_opt_set(csd, APR_INCOMPLETE_READ, 0); @@ -1808,7 +2018,6 @@ static void process_lingering_close(event_conn_state_t *cs) cs->pub.state = CONN_STATE_LINGER_NORMAL; } cs->pub.sense = CONN_SENSE_DEFAULT; - notify_suspend(cs); /* One timestamp/duration for the whole lingering close time. * XXX: This makes the (short_)linger_q not sorted/ordered by expiring @@ -1821,32 +2030,18 @@ static void process_lingering_close(event_conn_state_t *cs) } do { - nbytes = sizeof(dummybuf); + apr_size_t nbytes = sizeof(dummybuf); rv = apr_socket_recv(csd, dummybuf, &nbytes); } while (rv == APR_SUCCESS); - - if (!APR_STATUS_IS_EAGAIN(rv)) { - close_connection(cs); - return; - } - - /* (Re)queue the connection to come back when readable */ - update_reqevents_from_sense(cs, CONN_SENSE_WANT_READ); - q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(q, cs); - rv = apr_pollset_add(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { - AP_DEBUG_ASSERT(0); - TO_QUEUE_REMOVE(q, cs); - apr_thread_mutex_unlock(timeout_mutex); - ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03092) - "process_lingering_close: apr_pollset_add failure"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - return; + if (APR_STATUS_IS_EAGAIN(rv)) { + struct timeout_queue *q; + /* (Re)queue the connection to come back when readable */ + q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; + if (pollset_add(cs, CONN_SENSE_WANT_READ, q)) { + return; /* queued */ + } } - apr_thread_mutex_unlock(timeout_mutex); + close_connection(cs); } /* call 'func' for all elements of 'q' above 'expiry'. @@ -1860,7 +2055,6 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, event_conn_state_t *first, *cs, *last; struct event_conn_state_t trash; struct timeout_queue *qp; - apr_status_t rv; if (!*q->total) { return; @@ -1891,19 +2085,29 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, apr_time_t elem_expiry = cs->queue_timestamp + qp->timeout; apr_time_t next_expiry = queues_next_expiry; if (!next_expiry - || next_expiry > elem_expiry + TIMEOUT_FUDGE_FACTOR) { + || next_expiry > elem_expiry + QUEUES_FUDGE_TIMEOUT) { queues_next_expiry = elem_expiry; } break; } - last = cs; - rv = apr_pollset_remove(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) { - AP_DEBUG_ASSERT(0); - ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, cs->c, APLOGNO(00473) - "apr_pollset_remove failed"); + TO_QUEUE_REMOVE(qp, cs); + if (!pollset_del(cs, 1)) { + shutdown_connection(cs); + continue; } + + if (cs == first) { + APR_RING_INSERT_HEAD(&qp->head, cs, event_conn_state_t, + timeout_list); + } + else { + APR_RING_INSERT_AFTER(last, cs, timeout_list); + } + ++*qp->total; + ++qp->count; + + last = cs; cs = APR_RING_NEXT(cs, timeout_list); count++; } @@ -1925,7 +2129,7 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, first = APR_RING_FIRST(&trash.timeout_list); do { cs = APR_RING_NEXT(first, timeout_list); - TO_QUEUE_ELEM_INIT(first); + APR_RING_ELEM_INIT(cs, timeout_list); func(first); first = cs; } while (--total); @@ -1950,8 +2154,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) apr_status_t rc; proc_info *ti = dummy; int process_slot = ti->pslot; - struct process_score *ps = ap_get_scoreboard_process(process_slot); - int closed = 0; + 
process_score *ps = ap_get_scoreboard_process(process_slot);
     int have_idle_worker = 0;
     apr_time_t last_log;
 
@@ -1969,31 +2172,37 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
     unblock_signal(LISTENER_SIGNAL);
 
     for (;;) {
-        timer_event_t *te;
-        const apr_pollfd_t *out_pfd;
         apr_int32_t num = 0;
-        apr_interval_time_t timeout;
-        socket_callback_baton_t *user_chain;
-        apr_time_t now, expiry = -1;
+        apr_time_t next_expiry = -1;
+        apr_interval_time_t timeout = -1;
         int workers_were_busy = 0;
+        socket_callback_baton_t *user_chain;
+        const apr_pollfd_t *out_pfd;
+        apr_time_t now;
+        event_conn_state_t *cs;
+        timer_event_t *te;
 
-        if (conns_this_child <= 0)
+        if (conns_this_child <= 0) {
+            /* Gracefully stop (eventually) and keep going */
            check_infinite_requests();
+        }
 
        if (listener_may_exit) {
-            int first_close = close_listeners(&closed);
+            int once = !dying;
+            if (once) {
+                set_child_dying();
+            }
 
            if (terminate_mode == ST_UNGRACEFUL
                || apr_atomic_read32(&connection_count) == 0)
                break;
 
-            /* Don't wait in poll() for the first close (i.e. dying now), we
-             * want to maintain the queues and schedule defer_linger_chain ASAP
-             * to kill kept-alive connection and shutdown the workers and child
-             * faster.
-             */
-            if (first_close) {
-                goto do_maintenance; /* with expiry == -1 */
+            if (once) {
+                /* Don't wait in poll() the first time (i.e. dying now), we
+                 * want to maintain the queues ASAP to shut down the workers
+                 * and exit the child faster.
+                 */
+                goto do_maintenance; /* with next_expiry == -1 */
            }
        }
 
@@ -2002,8 +2211,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
            /* trace log status every second */
            if (now - last_log > apr_time_from_sec(1)) {
                ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf,
-                             "connections: %u (waitio:%u write-completion:%u"
-                             "keep-alive:%u lingering:%u suspended:%u clogged:%u), "
+                             "connections: %u (waitio:%u write:%u keepalive:%u "
+                             "lingering:%u suspended:%u clogged:%u), "
                             "workers: %u/%u shutdown",
                             apr_atomic_read32(&connection_count),
                             apr_atomic_read32(waitio_q->total),
@@ -2034,11 +2243,11 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
        now = apr_time_now();
        timeout = -1;
 
-        /* Push expired timers to a worker, the first remaining one determines
-         * the maximum time to poll() below, if any.
+        /* Push expired timers to a worker, the first remaining one (if any)
+         * determines the maximum time to poll() below.
*/ - expiry = timers_next_expiry; - if (expiry && expiry < now) { + next_expiry = timers_next_expiry; + if (next_expiry && next_expiry <= now) { apr_thread_mutex_lock(g_timer_skiplist_mtx); while ((te = apr_skiplist_peek(timer_skiplist))) { if (te->when > now) { @@ -2047,56 +2256,67 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) break; } apr_skiplist_pop(timer_skiplist, NULL); - if (!te->canceled) { - if (te->pfds) { - /* remove all sockets from the pollset */ - apr_pool_cleanup_run(te->pfds->pool, te->pfds, - event_cleanup_poll_callback); - } - push_timer2worker(te); + + if (te->canceled) { + put_timer_event(te, 1); + continue; } - else { - APR_RING_INSERT_TAIL(&timer_free_ring.link, te, - timer_event_t, link); + + if (te->pfds) { + /* remove all sockets from the pollset */ + apr_pool_cleanup_run(te->pfds->pool, te->pfds, + event_cleanup_poll_callback); } + push_timer2worker(te); + } + if (te) { + next_expiry = te->when; } - if (!te) { - timers_next_expiry = 0; + else { + next_expiry = 0; } + timers_next_expiry = next_expiry; apr_thread_mutex_unlock(g_timer_skiplist_mtx); } + if (next_expiry) { + timeout = next_expiry > now ? next_expiry - now : 0; + } /* Same for queues, use their next expiry, if any. */ - expiry = queues_next_expiry; - if (expiry - && (timeout < 0 - || expiry <= now - || timeout > expiry - now)) { - timeout = expiry > now ? expiry - now : 0; + next_expiry = queues_next_expiry; + if (next_expiry && (timeout < 0 || next_expiry - now < timeout)) { + timeout = next_expiry > now ? next_expiry - now : 0; } /* When non-wakeable, don't wait more than 100 ms, in any case. */ -#define NON_WAKEABLE_POLL_TIMEOUT apr_time_from_msec(100) - if (!listener_is_wakeable - && (timeout < 0 - || timeout > NON_WAKEABLE_POLL_TIMEOUT)) { - timeout = NON_WAKEABLE_POLL_TIMEOUT; + if (!listener_is_wakeable && (timeout < 0 || timeout > NON_WAKEABLE_TIMEOUT)) { + timeout = NON_WAKEABLE_TIMEOUT; } else if (timeout > 0) { - /* apr_pollset_poll() might round down the timeout to milliseconds, - * let's forcibly round up here to never return before the timeout. + /* apr_pollset_poll() might round down the timeout to + * milliseconds, let's forcibly round up here to never + * return before the timeout. */ timeout = apr_time_from_msec( apr_time_as_msec(timeout + apr_time_from_msec(1) - 1) ); } + /* Unpause listening sockets before poll()ing if possible */ + if (should_enable_listensocks()) { + enable_listensocks(); + } + ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, - "polling with timeout=%" APR_TIME_T_FMT + "pollset: wait for timeout=%" APR_TIME_T_FMT " queues_timeout=%" APR_TIME_T_FMT - " timers_timeout=%" APR_TIME_T_FMT, - timeout, queues_next_expiry - now, - timers_next_expiry - now); + " timers_timeout=%" APR_TIME_T_FMT + " conns=%d exit=%d/%d", + timeout, + queues_next_expiry ? queues_next_expiry - now : -1, + timers_next_expiry ? timers_next_expiry - now : -1, + apr_atomic_read32(&connection_count), + listener_may_exit, dying); rc = apr_pollset_poll(event_pollset, timeout, &num, &out_pfd); if (rc != APR_SUCCESS) { @@ -2105,59 +2325,55 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) APLOGNO(03267) "apr_pollset_poll failed. 
Attempting to " "shutdown process gracefully"); + AP_DEBUG_ASSERT(0); signal_threads(ST_GRACEFUL); } num = 0; } if (APLOGtrace7(ap_server_conf)) { + apr_time_t old_now = now; now = apr_time_now(); + ap_log_error(APLOG_MARK, APLOG_TRACE7, rc, ap_server_conf, - "polled with num=%u exit=%d/%d conns=%d" + "pollset: have #%i time=%" APR_TIME_T_FMT "/%" APR_TIME_T_FMT " queues_timeout=%" APR_TIME_T_FMT - " timers_timeout=%" APR_TIME_T_FMT, - num, listener_may_exit, dying, + " timers_timeout=%" APR_TIME_T_FMT + " conns=%d exit=%d/%d", + (int)num, now - old_now, timeout, + queues_next_expiry ? queues_next_expiry - now : -1, + timers_next_expiry ? timers_next_expiry - now : -1, apr_atomic_read32(&connection_count), - queues_next_expiry - now, timers_next_expiry - now); + listener_may_exit, dying); } /* XXX possible optimization: stash the current time for use as * r->request_time for new requests or queues maintenance */ - for (user_chain = NULL; num; --num, ++out_pfd) { - listener_poll_type *pt = (listener_poll_type *) out_pfd->client_data; + for (user_chain = NULL; num > 0; --num, ++out_pfd) { + listener_poll_type *pt = out_pfd->client_data; + if (pt->type == PT_CSD) { /* one of the sockets is readable */ - event_conn_state_t *cs = (event_conn_state_t *) pt->baton; - struct timeout_queue *remove_from_q = NULL; - /* don't wait for a worker for a keepalive request or - * lingering close processing. */ - int blocking = 0; - - switch (cs->pub.state) { - case CONN_STATE_WRITE_COMPLETION: - remove_from_q = cs->sc->wc_q; - blocking = 1; - break; + int blocking = 1; - case CONN_STATE_ASYNC_WAITIO: - cs->pub.state = CONN_STATE_PROCESSING; - remove_from_q = cs->sc->io_q; - blocking = 1; - break; + cs = (event_conn_state_t *) pt->baton; + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "polled connection %" CS_FMT, + CS_ARG(cs)); + switch (cs->pub.state) { case CONN_STATE_KEEPALIVE: + case CONN_STATE_ASYNC_WAITIO: cs->pub.state = CONN_STATE_PROCESSING; - remove_from_q = cs->sc->ka_q; + case CONN_STATE_WRITE_COMPLETION: break; case CONN_STATE_LINGER_NORMAL: - remove_from_q = linger_q; - break; - case CONN_STATE_LINGER_SHORT: - remove_from_q = short_linger_q; + /* don't wait for a worker for lingering close processing. 
*/ + blocking = 0; break; default: @@ -2168,26 +2384,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ap_assert(0); } - if (remove_from_q) { - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_REMOVE(remove_from_q, cs); - rc = apr_pollset_remove(event_pollset, &cs->pfd); - apr_thread_mutex_unlock(timeout_mutex); - /* - * Some of the pollset backends, like KQueue or Epoll - * automagically remove the FD if the socket is closed, - * therefore, we can accept _SUCCESS or _NOTFOUND, - * and we still want to keep going - */ - if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) { - AP_DEBUG_ASSERT(0); - ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf, - APLOGNO(03094) "pollset remove failed"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - break; - } + if (!pollset_del(cs, 0)) { + shutdown_connection(cs); + continue; + } + { /* If we don't get a worker immediately (nonblocking), we * close the connection; the client can re-connect to a * different process for keepalive, and for lingering close @@ -2269,14 +2471,21 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) resource_shortage = 1; signal_threads(ST_GRACEFUL); } - else if (ap_accept_error_is_nonfatal(rc)) { - ap_log_error(APLOG_MARK, APLOG_DEBUG, rc, ap_server_conf, + else if (ap_accept_error_is_nonfatal(rc)) { + ap_log_error(APLOG_MARK, APLOG_DEBUG, rc, ap_server_conf, "accept() on client socket failed"); } if (csd != NULL) { conns_this_child--; - if (push2worker(NULL, csd, ptrans) == APR_SUCCESS) { + + /* Create and account for the connection from here, or + * a graceful shutdown happening before it's processed + * would consider it does not exist and could exit the + * child too early. + */ + cs = make_conn_state(ptrans, csd); + if (push2worker(cs, NULL, NULL) == APR_SUCCESS) { have_idle_worker = 0; } } @@ -2304,7 +2513,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * with the user callback being called while we handle * the same baton multiple times here. */ - if (!baton->signaled) { + if (!baton->signaled) { baton->signaled = 1; baton->next = user_chain; user_chain = baton; @@ -2312,7 +2521,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } } /* for processing poll */ - /* Time to handle user callbacks chained above */ + /* Time to queue user callbacks chained above */ while (user_chain) { socket_callback_baton_t *baton = user_chain; user_chain = user_chain->next; @@ -2323,30 +2532,31 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) event_cleanup_poll_callback); /* masquerade as a timer event that is firing */ - te = event_get_timer_event(-1 /* fake timer */, - baton->cbfunc, - baton->user_baton, - 0, /* don't insert it */ - NULL /* no associated socket callback */); + te = get_timer_event(-1 /* fake timer */, + baton->cbfunc, + baton->user_baton, + 0, /* don't insert it */ + NULL /* no associated socket callback */); push_timer2worker(te); } /* We process the timeout queues here only when the global - * queues_next_expiry is passed. This happens accurately since + * queues_next_expiry has passed. This happens accurately since * adding to the queues (in workers) can only decrease this expiry, * while latest ones are only taken into account here (in listener) * during queues' processing, with the lock held. This works both * with and without wake-ability. 
*/ - expiry = queues_next_expiry; + next_expiry = queues_next_expiry; do_maintenance: - if (expiry && expiry < (now = apr_time_now())) { + if (next_expiry && next_expiry <= (now = apr_time_now())) { ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, - "queues maintenance with timeout=%" APR_TIME_T_FMT, - expiry > 0 ? expiry - now : -1); + "queues maintenance: expired=%" APR_TIME_T_FMT, + next_expiry > 0 ? now - next_expiry : -1); + apr_thread_mutex_lock(timeout_mutex); - /* Steps below will recompute this. */ + /* Recompute this by walking the timeout queues (under the lock) */ queues_next_expiry = 0; /* Step 1: keepalive queue timeouts are closed */ @@ -2373,11 +2583,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* Step 5: short lingering close queue timeouts are closed */ process_timeout_queue(short_linger_q, now, shutdown_connection); + next_expiry = queues_next_expiry; apr_thread_mutex_unlock(timeout_mutex); + ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, - "queues maintained with timeout=%" APR_TIME_T_FMT, - queues_next_expiry > now ? queues_next_expiry - now - : -1); + "queues maintained: next timeout=%" APR_TIME_T_FMT, + next_expiry ? next_expiry - now : -1); ps->wait_io = apr_atomic_read32(waitio_q->total); ps->write_completion = apr_atomic_read32(write_completion_q->total); @@ -2411,12 +2622,11 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) have_idle_worker = 0; } } - - if (!workers_were_busy && should_enable_listensocks()) { - enable_listensocks(); - } } /* listener main loop */ + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "listener thread exiting"); + ap_queue_term(worker_queue); apr_thread_exit(thd, APR_SUCCESS); @@ -2429,23 +2639,25 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * * return 1 if thread should exit, 0 if it should continue running. */ -static int worker_thread_should_exit_early(void) +static int worker_thread_should_exit_early(int slot) { + const apr_uint32_t max = threads_per_child; for (;;) { apr_uint32_t conns = apr_atomic_read32(&connection_count); - apr_uint32_t dead = apr_atomic_read32(&threads_shutdown); - apr_uint32_t newdead; + apr_uint32_t deads = apr_atomic_read32(&threads_shutdown); - AP_DEBUG_ASSERT(dead <= threads_per_child); - if (conns >= threads_per_child - dead) + AP_DEBUG_ASSERT(deads < max); + if (conns >= max - deads) return 0; - newdead = dead + 1; - if (apr_atomic_cas32(&threads_shutdown, newdead, dead) == dead) { + if (apr_atomic_cas32(&threads_shutdown, deads + 1, deads) == deads) { /* * No other thread has exited in the mean time, safe to exit * this one. 
*/ + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "worker thread %i/%i-%i should exit (%i conns)", + slot, threads_per_child, deads + 1, conns); return 1; } } @@ -2463,20 +2675,21 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) proc_info *ti = dummy; int process_slot = ti->pslot; int thread_slot = ti->tslot; + worker_score *ws = &ap_scoreboard_image->servers[process_slot][thread_slot]; apr_status_t rv; int is_idle = 0; free(ti); - ap_scoreboard_image->servers[process_slot][thread_slot].pid = ap_my_pid; - ap_scoreboard_image->servers[process_slot][thread_slot].tid = apr_os_thread_current(); - ap_scoreboard_image->servers[process_slot][thread_slot].generation = retained->mpm->my_generation; + ws->pid = ap_my_pid; + ws->tid = apr_os_thread_current(); + ws->generation = retained->mpm->my_generation; ap_update_child_status_from_indexes(process_slot, thread_slot, SERVER_STARTING, NULL); for (;;) { apr_socket_t *csd = NULL; - event_conn_state_t *cs; + event_conn_state_t *cs = NULL; timer_event_t *te = NULL; apr_pool_t *ptrans; /* Pool for per-transaction stuff */ @@ -2490,23 +2703,33 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) signal_threads(ST_GRACEFUL); break; } - /* A new idler may have changed connections_above_limit(), - * let the listener know and decide. + ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, + "worker thread %i/%i idle (idlers %i)", + thread_slot, threads_per_child, + ap_queue_info_num_idlers(worker_queue_info)); + is_idle = 1; + + /* If the listening sockets are paused and this new idler switches + * connections_above_limit() back, let the listener know and poll + * them again. */ if (listener_is_wakeable && should_enable_listensocks()) { apr_pollset_wakeup(event_pollset); } - is_idle = 1; } ap_update_child_status_from_indexes(process_slot, thread_slot, dying ? SERVER_GRACEFUL - : SERVER_READY, NULL); - worker_pop: + : SERVER_READY, + NULL); + if (workers_may_exit) { + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "worker thread %i/%i may exit", + thread_slot, threads_per_child); break; } - if (dying && worker_thread_should_exit_early()) { + if (dying && worker_thread_should_exit_early(thread_slot)) { break; } @@ -2518,8 +2741,12 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) * connections accepted by this server process have been handled. */ if (APR_STATUS_IS_EOF(rv)) { + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "worker thread %i/%i queue terminated", + thread_slot, threads_per_child); break; } + /* We get APR_EINTR whenever ap_queue_pop_*() has been interrupted * from an explicit call to ap_queue_interrupt_all(). This allows * us to unblock threads stuck in ap_queue_pop_*() when a shutdown @@ -2531,26 +2758,29 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) * may have already been cleaned up. Don't log the "error" if * workers_may_exit is set. */ - else if (APR_STATUS_IS_EINTR(rv)) { - goto worker_pop; - } - /* We got some other error. 
*/ - else if (!workers_may_exit) { + if (!APR_STATUS_IS_EINTR(rv) && !workers_may_exit) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, ap_server_conf, - APLOGNO(03099) "ap_queue_pop_socket failed"); + APLOGNO(03099) "ap_queue_pop_something failed"); + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); } continue; } + + ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, + "worker thread %i/%i busy (idlers %i)", + thread_slot, threads_per_child, + ap_queue_info_num_idlers(worker_queue_info)); + if (te != NULL) { - te->cbfunc(te->baton); - { - apr_thread_mutex_lock(g_timer_skiplist_mtx); - APR_RING_INSERT_TAIL(&timer_free_ring.link, te, timer_event_t, link); - apr_thread_mutex_unlock(g_timer_skiplist_mtx); - } + void *baton = te->baton; + ap_mpm_callback_fn_t *cbfunc = te->cbfunc; + /* first recycle the timer event */ + put_timer_event(te, 0); + cbfunc(baton); } else { - is_idle = 0; + is_idle = 0; /* consumed */ if (csd != NULL) { worker_sockets[thread_slot] = csd; process_socket(thd, ptrans, csd, cs, process_slot, thread_slot); @@ -2572,15 +2802,23 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) cs->chain = NULL; AP_DEBUG_ASSERT(cs->pub.state == CONN_STATE_LINGER); - worker_sockets[thread_slot] = csd = cs->pfd.desc.s; + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "deferred close for connection %" CS_FMT, CS_ARG(cs)); + + worker_sockets[thread_slot] = csd = cs_sd(cs); process_socket(thd, cs->p, csd, cs, process_slot, thread_slot); worker_sockets[thread_slot] = NULL; } } + if (is_idle) { + /* Not idling anymore */ + ap_queue_info_wait_for_idler(worker_queue_info, NULL); + } ap_update_child_status_from_indexes(process_slot, thread_slot, dying ? SERVER_DEAD - : SERVER_GRACEFUL, NULL); + : SERVER_GRACEFUL, + NULL); apr_thread_exit(thd, APR_SUCCESS); return NULL; @@ -2623,14 +2861,14 @@ static void setup_threads_runtime(void) ap_listen_rec *lr; apr_pool_t *pskip = NULL; int max_recycled_pools = -1, i; - const int good_methods[] = { APR_POLLSET_KQUEUE, - APR_POLLSET_PORT, + const int good_methods[] = { APR_POLLSET_PORT, + APR_POLLSET_KQUEUE, APR_POLLSET_EPOLL }; /* XXX: K-A or lingering close connection included in the async factor */ - const apr_uint32_t async_factor = worker_factor / WORKER_FACTOR_SCALE; - const apr_uint32_t pollset_size = (apr_uint32_t)num_listensocks + - (apr_uint32_t)threads_per_child * - (async_factor > 2 ? async_factor : 2); + const unsigned int threads_factor = worker_factor / WORKER_FACTOR_SCALE; + const apr_size_t pollset_size = ((unsigned int)num_listensocks + + (unsigned int)threads_per_child * + (threads_factor > 2 ? 
threads_factor : 2)); int pollset_flags; /* Event's skiplist operations will happen concurrently with other modules' @@ -2730,13 +2968,13 @@ static void setup_threads_runtime(void) } /* Add listeners to the main pollset */ - listener_pollfd = apr_pcalloc(pruntime, num_listensocks * - sizeof(apr_pollfd_t)); + listener_pollfd = apr_pcalloc(pruntime, + num_listensocks * sizeof(apr_pollfd_t)); for (i = 0, lr = my_bucket->listeners; lr; lr = lr->next, i++) { apr_pollfd_t *pfd; listener_poll_type *pt; - AP_DEBUG_ASSERT(i < num_listensocks); + ap_assert(i < num_listensocks); pfd = &listener_pollfd[i]; pfd->reqevents = APR_POLLIN | APR_POLLHUP | APR_POLLERR; @@ -2758,7 +2996,12 @@ static void setup_threads_runtime(void) pt->baton = lr; apr_socket_opt_set(pfd->desc.s, APR_SO_NONBLOCK, 1); - apr_pollset_add(event_pollset, pfd); + rv = apr_pollset_add(event_pollset, pfd); + if (rv != APR_SUCCESS) { + ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(10473) + "apr_pollset_add for listener failed."); + clean_child_exit(APEXIT_CHILDFATAL); + } lr->accept_func = ap_unixd_accept; } @@ -2906,7 +3149,7 @@ static void join_workers(apr_thread_t * listener, apr_thread_t ** threads) /* listener has not stopped accepting yet */ ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, "listener has not stopped accepting yet (%d iter)", iter); - wakeup_listener(); + shutdown_listener(); } if (iter > 10) { ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(00475) @@ -2922,6 +3165,9 @@ static void join_workers(apr_thread_t * listener, apr_thread_t ** threads) } for (i = 0; i < threads_per_child; i++) { + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "apr_thread_join: joining thread %pp (%i/%i)", + threads[i], i, threads_per_child); if (threads[i]) { /* if we ever created this thread */ rv = apr_thread_join(&thread_rv, threads[i]); if (rv != APR_SUCCESS) { @@ -3043,7 +3289,7 @@ static void child_main(int child_num_arg, int child_bucket) if (rv != APR_SUCCESS && rv != APR_ENOTIMPL) { ap_log_error(APLOG_MARK, APLOG_WARNING, rv, ap_server_conf, APLOGNO(02436) "WARNING: ThreadStackSize of %" APR_SIZE_T_FMT " is " - "inappropriate, using default", + "inappropriate, using default", ap_thread_stacksize); } } @@ -3384,7 +3630,7 @@ static void perform_idle_server_maintenance(void) retained->maxclients_reported = 1; } } - else { + else { if (!retained->near_maxclients_reported) { ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(10159) "server is within MinSpareThreads of " @@ -3490,7 +3736,7 @@ static void server_main_loop(int remaining_children_to_start) child_slot = ap_find_child_by_pid(&pid); if (processed_status == APEXIT_CHILDFATAL) { /* fix race condition found in PR 39311 - * A child created at the same time as a graceful happens + * A child created at the same time as a graceful happens * can find the lock missing and create a fatal error. * It is not fatal for the last generation to be in this state. 
*/ @@ -3866,25 +4112,23 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) return OK; } -static void setup_slave_conn(conn_rec *c, void *csd) +static void setup_slave_conn(conn_rec *c, void *csd) { event_conn_state_t *mcs; event_conn_state_t *cs; - + mcs = ap_get_module_config(c->master->conn_config, &mpm_event_module); - - cs = apr_pcalloc(c->pool, sizeof(*cs)); + + cs = make_conn_state(c->pool, csd); cs->c = c; - cs->r = NULL; cs->sc = mcs->sc; cs->suspended = 0; - cs->p = c->pool; cs->bucket_alloc = c->bucket_alloc; cs->pfd = mcs->pfd; cs->pub = mcs->pub; cs->pub.state = CONN_STATE_PROCESSING; cs->pub.sense = CONN_SENSE_DEFAULT; - + c->cs = &(cs->pub); ap_set_module_config(c->conn_config, &mpm_event_module, cs); } @@ -3908,7 +4152,7 @@ static int event_protocol_switch(conn_rec *c, request_rec *r, server_rec *s, * other than http/1.1, this might never happen. */ event_conn_state_t *cs; - + cs = ap_get_module_config(c->conn_config, &mpm_event_module); cs->sc = ap_get_module_config(s->module_config, &mpm_event_module); } @@ -3932,7 +4176,11 @@ static int event_open_logs(apr_pool_t * p, apr_pool_t * plog, level_flags |= APLOG_STARTUP; } - if ((num_listensocks = ap_setup_listeners(ap_server_conf)) < 1) { + /* This sets up new listeners or reuses existing ones, as well as cleaning + * up unused ones from the previous generation. + */ + num_listensocks = ap_setup_listeners(ap_server_conf); + if (num_listensocks < 1) { ap_log_error(APLOG_MARK, APLOG_ALERT | level_flags, 0, (startup ? NULL : s), APLOGNO(03272) "no listening sockets available, shutting down"); @@ -4045,74 +4293,34 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, apr_pool_t *ptemp, server_rec *s) { - struct { - struct timeout_queue *tail, *q; - apr_hash_t *hash; - } io, wc, ka; + apr_hash_t *io_h, *wc_h, *ka_h; /* Not needed in pre_config stage */ if (ap_state_query(AP_SQ_MAIN_STATE) == AP_SQ_MS_CREATE_PRE_CONFIG) { return OK; } - io.hash = apr_hash_make(ptemp); - wc.hash = apr_hash_make(ptemp); - ka.hash = apr_hash_make(ptemp); - io.tail = wc.tail = ka.tail = NULL; + io_h = apr_hash_make(ptemp); + wc_h = apr_hash_make(ptemp); + ka_h = apr_hash_make(ptemp); - linger_q = TO_QUEUE_MAKE(pconf, apr_time_from_sec(MAX_SECS_TO_LINGER), - NULL); - short_linger_q = TO_QUEUE_MAKE(pconf, apr_time_from_sec(SECONDS_TO_LINGER), - NULL); + linger_q = TO_QUEUE_MAKE(pconf, "linger", + apr_time_from_sec(MAX_SECS_TO_LINGER), NULL); + short_linger_q = TO_QUEUE_MAKE(pconf, "short_linger", + apr_time_from_sec(SECONDS_TO_LINGER), NULL); for (; s; s = s->next) { event_srv_cfg *sc = apr_pcalloc(pconf, sizeof *sc); - ap_set_module_config(s->module_config, &mpm_event_module, sc); - if (!io.tail) { - /* The main server uses the global queues */ - io.q = TO_QUEUE_MAKE(pconf, s->timeout, NULL); - apr_hash_set(io.hash, &s->timeout, sizeof s->timeout, io.q); - io.tail = waitio_q = io.q; - - wc.q = TO_QUEUE_MAKE(pconf, s->timeout, NULL); - apr_hash_set(wc.hash, &s->timeout, sizeof s->timeout, wc.q); - wc.tail = write_completion_q = wc.q; - - ka.q = TO_QUEUE_MAKE(pconf, s->keep_alive_timeout, NULL); - apr_hash_set(ka.hash, &s->keep_alive_timeout, - sizeof s->keep_alive_timeout, ka.q); - ka.tail = keepalive_q = ka.q; - } - else { - /* The vhosts use any existing queue with the same timeout, - * or their own queue(s) if there isn't */ - io.q = apr_hash_get(io.hash, &s->timeout, sizeof s->timeout); - if (!io.q) { - io.q = TO_QUEUE_MAKE(pconf, 
s->timeout, io.tail);
-                apr_hash_set(io.hash, &s->timeout, sizeof s->timeout, io.q);
-                io.tail = io.tail->next = io.q;
-            }
-            wc.q = apr_hash_get(wc.hash, &s->timeout, sizeof s->timeout);
-            if (!wc.q) {
-                wc.q = TO_QUEUE_MAKE(pconf, s->timeout, wc.tail);
-                apr_hash_set(wc.hash, &s->timeout, sizeof s->timeout, wc.q);
-                wc.tail = wc.tail->next = wc.q;
-            }
+        sc->io_q = TO_QUEUE_CHAIN(pconf, "waitio", s->timeout,
+                                  &waitio_q, io_h, ptemp);
 
-            ka.q = apr_hash_get(ka.hash, &s->keep_alive_timeout,
-                                sizeof s->keep_alive_timeout);
-            if (!ka.q) {
-                ka.q = TO_QUEUE_MAKE(pconf, s->keep_alive_timeout, ka.tail);
-                apr_hash_set(ka.hash, &s->keep_alive_timeout,
-                             sizeof s->keep_alive_timeout, ka.q);
-                ka.tail = ka.tail->next = ka.q;
-            }
-        }
-        sc->io_q = io.q;
-        sc->wc_q = wc.q;
-        sc->ka_q = ka.q;
+        sc->wc_q = TO_QUEUE_CHAIN(pconf, "write_completion", s->timeout,
+                                  &write_completion_q, wc_h, ptemp);
+
+        sc->ka_q = TO_QUEUE_CHAIN(pconf, "keepalive", s->keep_alive_timeout,
+                                  &keepalive_q, ka_h, ptemp);
     }
 
     return OK;
@@ -4430,7 +4638,7 @@ static const char *set_threads_per_child(cmd_parms * cmd, void *dummy,
     threads_per_child = atoi(arg);
     return NULL;
 }
-static const char *set_server_limit (cmd_parms *cmd, void *dummy, const char *arg)
+static const char *set_server_limit(cmd_parms *cmd, void *dummy, const char *arg)
 {
     const char *err = ap_check_cmd_context(cmd, GLOBAL_ONLY);
     if (err != NULL) {
diff --git a/server/mpm/motorz/motorz.c b/server/mpm/motorz/motorz.c
index 7026d08cd6e..e06aeab573b 100644
--- a/server/mpm/motorz/motorz.c
+++ b/server/mpm/motorz/motorz.c
@@ -380,8 +380,8 @@ static apr_status_t motorz_io_process(motorz_conn_t *scon)
         scon->cs.state = CONN_STATE_PROCESSING;
     }
 
-read_request:
     if (scon->cs.state == CONN_STATE_PROCESSING) {
+ process_connection:
         ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(03328)
                      "motorz_io_process(): CONN_STATE_PROCESSING");
         if (!c->aborted) {
@@ -432,14 +432,14 @@ static apr_status_t motorz_io_process(motorz_conn_t *scon)
             }
             return APR_SUCCESS;
         }
-        if (c->keepalive != AP_CONN_KEEPALIVE) {
-            pending = DONE;
-        }
-        else if (pending == OK) {
-            pending = ap_check_input_pending(c);
+        if (pending == OK) {
+            /* Some data to process immediately? */
+            pending = (c->keepalive == AP_CONN_KEEPALIVE
+                       ? ap_check_input_pending(c)
+                       : DONE);
             if (pending == AGAIN) {
                 scon->cs.state = CONN_STATE_PROCESSING;
-                goto read_request;
+                goto process_connection;
             }
         }
         if (pending == OK) {
diff --git a/server/mpm/simple/simple_io.c b/server/mpm/simple/simple_io.c
index 36c5ad87956..154c9a2c1d3 100644
--- a/server/mpm/simple/simple_io.c
+++ b/server/mpm/simple/simple_io.c
@@ -126,11 +126,11 @@ static apr_status_t simple_io_process(simple_conn_t * scon)
             }
             return APR_SUCCESS;
         }
-        if (c->keepalive != AP_CONN_KEEPALIVE) {
-            pending = DONE;
-        }
-        else if (pending == OK) {
-            pending = ap_check_input_pending(c);
+        if (pending == OK) {
+            /* Some data to process immediately? */
+            pending = (c->keepalive == AP_CONN_KEEPALIVE
+                       ? ap_check_input_pending(c)
+                       : DONE);
             if (pending == AGAIN) {
                 scon->cs.state = CONN_STATE_PROCESSING;
                 continue;

From db8ec1e53750901d0acf44a59e6346f9bd9c7b90 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 1 Feb 2022 22:47:38 +0100
Subject: [PATCH 04/22] mpm_event: Use monotonic timestamps if available.

If clock_gettime() and CLOCK_MONOTONIC are defined (i.e. most if not all
unixes), use them to provide a timestamp that never goes backwards (even
if the admin changes the system time).

This avoids queue entries suddenly appearing to expire centuries in the
future after a clock skew.
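For illustration, a minimal standalone sketch (not part of the patch) of the
approach: it assumes a POSIX system where clock_gettime() and CLOCK_MONOTONIC
are available, and mirrors the timespec-to-microseconds conversion used by
event_time_now() in the diff below; the usec_t typedef is a stand-in for
apr_time_t.

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    typedef int64_t usec_t; /* microseconds, stand-in for apr_time_t */

    static usec_t monotonic_now(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts); /* unaffected by clock skew */
        return (usec_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
    }

    int main(void)
    {
        /* A queue entry set to expire 5s from now keeps expiring in ~5s
         * even if the wall clock is stepped in between. */
        usec_t expiry = monotonic_now() + 5 * 1000000;
        printf("expires in %lld us\n", (long long)(expiry - monotonic_now()));
        return 0;
    }

The fallback to apr_time_now() in the patch keeps the same microsecond unit,
so the queue and timer arithmetic is unchanged either way.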
* configure.in(): Provide HAVE_TIME_H, HAVE_CLOCK_GETTIME and HAVE_CLOCK_GETRES. * server/mpm/event/event.c(event_time_now): New helper to get a monotonic timestamp from clock_gettime() if it's available, or apr_time_now() (i.e. gettimeofday()) otherwise. * server/mpm/event/event.c(process_socket, event_resume_suspended, event_get_timer_event, process_lingering_close, listener_thread, event_run): Use event_time_now(). --- configure.in | 5 ++ server/mpm/event/event.c | 112 +++++++++++++++++++++++++++++++++++---- 2 files changed, 107 insertions(+), 10 deletions(-) diff --git a/configure.in b/configure.in index c56c8972afd..4b2098d8034 100644 --- a/configure.in +++ b/configure.in @@ -471,6 +471,8 @@ AC_CHECK_HEADERS( \ string.h \ limits.h \ unistd.h \ +time.h \ +mach/mach_time.h \ sys/socket.h \ pwd.h \ grp.h \ @@ -534,6 +536,9 @@ getpwnam \ getgrnam \ initgroups \ bindprocessor \ +clock_getres \ +clock_gettime \ +clock_gettime_nsec_np \ prctl \ procctl \ pthread_getthreadid_np \ diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 64ff1e30ead..795f4b1f37c 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -73,6 +73,9 @@ #ifdef HAVE_SYS_PROCESSOR_H #include /* for bindprocessor() */ #endif +#ifdef HAVE_TIME_H +#include /* for clock_gettime() */ +#endif #if !APR_HAS_THREADS #error The Event MPM requires APR threads, but they are unavailable. @@ -336,6 +339,93 @@ static APR_INLINE const char *cs_state_str(event_conn_state_t *cs) */ static event_conn_state_t *volatile defer_linger_chain; +#define USE_CLOCK_COARSE 0 /* not for now */ +#if HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC) /* POSIX */ +static clockid_t event_clockid; +#elif HAVE_CLOCK_GETTIME_NSEC_NP && defined(CLOCK_UPTIME_RAW) /* Newer OSX */ +/* All #include'd by already */ +#elif HAVE_MACH_MACH_TIME_H /* Older OSX */ +#include +#endif + +static void event_time_init(void) +{ +#if HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC) + event_clockid = (clockid_t)-1; + +#if HAVE_CLOCK_GETRES && defined(CLOCK_MONOTONIC_COARSE) && USE_CLOCK_COARSE + if (event_clockid == (clockid_t)-1) { + struct timespec ts; + if (clock_getres(CLOCK_MONOTONIC_COARSE, &ts) == 0) { + apr_time_t res = apr_time_from_sec(ts.tv_sec) + ts.tv_nsec / 1000; + if (res <= TIMERS_FUDGE_TIMEOUT) { + event_clockid = CLOCK_MONOTONIC_COARSE; + } + } + } +#endif /* CLOCK_MONOTONIC_COARSE */ + +#if HAVE_CLOCK_GETRES && defined(CLOCK_MONOTONIC_FAST) && USE_CLOCK_COARSE + if (event_clockid == (clockid_t)-1) { + struct timespec ts; + if (clock_getres(CLOCK_MONOTONIC_FAST, &ts) == 0) { + apr_time_t res = apr_time_from_sec(ts.tv_sec) + ts.tv_nsec / 1000; + if (res <= TIMERS_FUDGE_TIMEOUT) { + event_clockid = CLOCK_MONOTONIC_FAST; + } + } + } +#endif /* CLOCK_MONOTONIC_FAST */ + +#if HAVE_CLOCK_GETRES && defined(CLOCK_MONOTONIC_RAW_APPROX) && USE_CLOCK_COARSE + if (event_clockid == (clockid_t)-1) { + struct timespec ts; + if (clock_getres(CLOCK_MONOTONIC_RAW_APPROX, &ts) == 0) { + apr_time_t res = apr_time_from_sec(ts.tv_sec) + ts.tv_nsec / 1000; + if (res <= TIMERS_FUDGE_TIMEOUT) { + event_clockid = CLOCK_MONOTONIC_RAW_APPROX; + } + } + } +#endif /* CLOCK_MONOTONIC_RAW_APPROX */ + + if (event_clockid == (clockid_t)-1) { +#if defined(CLOCK_MONOTONIC_RAW) + event_clockid = CLOCK_MONOTONIC_RAW; +#else + event_clockid = CLOCK_MONOTONIC; +#endif + } + +#endif /* HAVE_CLOCK_GETTIME */ +} + +static apr_time_t event_time_now(void) +{ +#if HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC) + + struct timespec ts; + clock_gettime(event_clockid, &ts); + 
return apr_time_from_sec(ts.tv_sec) + ts.tv_nsec / 1000; + +#elif HAVE_CLOCK_GETTIME_NSEC_NP && defined(CLOCK_UPTIME_RAW) + + return clock_gettime_nsec_np(CLOCK_UPTIME_RAW) / 1000; + +#elif HAVE_MACH_MACH_TIME_H + + mach_timebase_info_data_t ti; + mach_timebase_info(&ti); + return mach_continuous_time() * ti.numer / ti.denom / 1000; + +#else + + /* XXX: not monotonic, still some platform to care about? */ + return apr_time_now(); + +#endif +} + APR_RING_HEAD(timeout_head_t, event_conn_state_t); struct timeout_queue { struct timeout_head_t head; @@ -375,7 +465,7 @@ static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *cs) ap_assert(q && !cs->q); cs->q = q; - cs->queue_timestamp = apr_time_now(); + cs->queue_timestamp = event_time_now(); APR_RING_INSERT_TAIL(&q->head, cs, event_conn_state_t, timeout_list); ++*q->total; ++q->count; @@ -1786,7 +1876,7 @@ static timer_event_t *get_timer_event(apr_time_t timeout, apr_array_header_t *pfds) { timer_event_t *te; - apr_time_t now = (timeout < 0) ? 0 : apr_time_now(); + apr_time_t now = (timeout < 0) ? 0 : event_time_now(); /* oh yeah, and make locking smarter/fine grained. */ @@ -2158,7 +2248,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) int have_idle_worker = 0; apr_time_t last_log; - last_log = apr_time_now(); + last_log = event_time_now(); free(ti); #if HAVE_SERF @@ -2207,7 +2297,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } if (APLOGtrace6(ap_server_conf)) { - now = apr_time_now(); + now = event_time_now(); /* trace log status every second */ if (now - last_log > apr_time_from_sec(1)) { ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, @@ -2240,7 +2330,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * up occurs, otherwise periodic checks (maintenance, shutdown, ...) * must be performed. */ - now = apr_time_now(); + now = event_time_now(); timeout = -1; /* Push expired timers to a worker, the first remaining one (if any) @@ -2333,7 +2423,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) if (APLOGtrace7(ap_server_conf)) { apr_time_t old_now = now; - now = apr_time_now(); + now = event_time_now(); ap_log_error(APLOG_MARK, APLOG_TRACE7, rc, ap_server_conf, "pollset: have #%i time=%" APR_TIME_T_FMT "/%" APR_TIME_T_FMT @@ -2549,7 +2639,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) */ next_expiry = queues_next_expiry; do_maintenance: - if (next_expiry && next_expiry <= (now = apr_time_now())) { + if (next_expiry && next_expiry <= (now = event_time_now())) { ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "queues maintenance: expired=%" APR_TIME_T_FMT, next_expiry > 0 ? now - next_expiry : -1); @@ -3257,7 +3347,7 @@ static void child_main(int child_num_arg, int child_bucket) } /* For rand() users (e.g. skiplist). 
*/ - srand((unsigned int)apr_time_now()); + srand((unsigned int)event_time_now()); ap_run_child_init(pchild, ap_server_conf); @@ -4057,7 +4147,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) } if (ap_graceful_shutdown_timeout) { - cutoff = apr_time_now() + + cutoff = event_time_now() + apr_time_from_sec(ap_graceful_shutdown_timeout); } @@ -4079,7 +4169,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) } } } while (!retained->mpm->shutdown_pending && active_children && - (!ap_graceful_shutdown_timeout || apr_time_now() < cutoff)); + (!ap_graceful_shutdown_timeout || event_time_now() < cutoff)); /* We might be here because we received SIGTERM, either * way, try and make sure that all of our processes are @@ -4210,6 +4300,8 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, foreground = ap_exists_config_define("FOREGROUND"); } + event_time_init(); + retained = ap_retained_data_get(userdata_key); if (!retained) { retained = ap_retained_data_create(userdata_key, sizeof(*retained)); From 8f3ed4cc7a3d20b864ee898930e189e39cce55fa Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 9 Jul 2024 15:53:33 +0200 Subject: [PATCH 05/22] mpm_event: No need/use of "clogged" connections count, axe. --- server/mpm/event/event.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 795f4b1f37c..4e544ccdec9 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -205,7 +205,6 @@ static volatile int listener_may_exit = 0; static apr_uint32_t connection_count = 0; /* Number of open connections */ static apr_uint32_t lingering_count = 0; /* Number of connections in lingering close */ static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ -static apr_uint32_t clogged_count = 0; /* Number of threads processing ssl conns */ static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown early during graceful termination */ static int had_healthy_child = 0; @@ -703,8 +702,7 @@ static void disable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) "Suspend listening sockets: idlers:%i conns:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u " - "suspended:%u clogged:%u", + "waitio:%u write:%u keepalive:%u linger:%u/%u suspended:%u", ap_queue_info_num_idlers(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(waitio_q->total), @@ -712,8 +710,7 @@ static void disable_listensocks(void) apr_atomic_read32(keepalive_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), - apr_atomic_read32(&suspended_count), - apr_atomic_read32(&clogged_count)); + apr_atomic_read32(&suspended_count)); ap_scoreboard_image->parent[ap_child_slot].not_accepting = 1; @@ -732,8 +729,7 @@ static void enable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) "Resume listening sockets: idlers:%i conns:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u " - "suspended:%u clogged:%u", + "waitio:%u write:%u keepalive:%u linger:%u/%u suspended:%u", ap_queue_info_num_idlers(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(waitio_q->total), @@ -741,8 +737,7 @@ static void enable_listensocks(void) apr_atomic_read32(keepalive_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), - apr_atomic_read32(&suspended_count), - 
apr_atomic_read32(&clogged_count));
+                  apr_atomic_read32(&suspended_count));
 
     /*
      * XXX: This is not yet optimal. If many workers suddenly become available,
@@ -1415,7 +1410,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
 {
     conn_rec *c = cs->c;
     long conn_id = ID_FROM_CHILD_THREAD(my_child_num, my_thread_num);
-    int rc = OK, processed = 0, clogging;
+    int rc = OK, processed = 0;
 
     if (!c) { /* This is a new connection */
         cs->bucket_alloc = apr_bucket_alloc_create(p);
@@ -1469,14 +1464,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
 process_connection:
     processed = 1;
     cs->pub.state = CONN_STATE_PROCESSING;
-    clogging = c->clogging_input_filters;
-    if (clogging) {
-        apr_atomic_inc32(&clogged_count);
-    }
     rc = ap_run_process_connection(c);
-    if (clogging) {
-        apr_atomic_dec32(&clogged_count);
-    }
     /*
      * The process_connection hooks should set the appropriate connection
      * state upon return, for event MPM to either:
@@ -2302,15 +2290,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
             if (now - last_log > apr_time_from_sec(1)) {
                 ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf,
                              "connections: %u (waitio:%u write:%u keepalive:%u "
-                             "lingering:%u suspended:%u clogged:%u), "
-                             "workers: %u/%u shutdown",
+                             "lingering:%u suspended:%u), workers: %u/%u shutdown",
                              apr_atomic_read32(&connection_count),
                              apr_atomic_read32(waitio_q->total),
                              apr_atomic_read32(write_completion_q->total),
                              apr_atomic_read32(keepalive_q->total),
                              apr_atomic_read32(&lingering_count),
                              apr_atomic_read32(&suspended_count),
-                             apr_atomic_read32(&clogged_count),
                              apr_atomic_read32(&threads_shutdown),
                              threads_per_child);
                 last_log = now;

From f1367ba03edeaaa1ddd451b2561b69e20c976c13 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 1 Feb 2022 22:24:15 +0100
Subject: [PATCH 06/22] mpm_event: Use r->server's Timeout after the
 post_read_request hook.

Regardless of keep_alive_timeout_set, which anyway only concerns the
KeepAliveTimeout to apply _after_ the current request, always use the
request's server Timeout during its processing (i.e. CONN_STATE_HEAR
and CONN_STATE_COMPLETION).

To save the next KeepAliveTimeout to use later, add a new event_srv_cfg
to the conn_state which points to the appropriate server (either
r->server or c->base_server depending on keep_alive_timeout_set as
before).

* server/mpm/event/event.c(struct event_conn_state_t):
  Add event_srv_cfg *ka_sc as the server config to apply for kept alive
  connections.

* server/mpm/event/event.c(event_post_read_request):
  Always set cs->sc to the event_srv_cfg of the request's server, and
  point cs->ka_sc to the appropriate one according to
  keep_alive_timeout_set.

* server/mpm/event/event.c(make_conn_state):
  Initialize cs->ka_sc to the ap_server_conf's event_srv_cfg, like
  cs->sc.

* server/mpm/event/event.c(process_socket):
  Use cs->ka_sc->ka_q for CONN_STATE_KEEPALIVE.
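In effect, the timeout selection can be summarized by this minimal
standalone sketch (illustration only, not part of the patch; the helper
name timeout_server() is hypothetical):

    #include "httpd.h"  /* request_rec, conn_rec, server_rec */

    /* During request processing the request's server always applies
     * (cs->sc); for the next keep-alive wait the request's server
     * applies only if it sets KeepAliveTimeout explicitly, otherwise
     * the base server (first on this IP:port) does (cs->ka_sc).
     */
    static server_rec *timeout_server(request_rec *r, int keepalive)
    {
        if (!keepalive) {
            return r->server;                      /* -> cs->sc */
        }
        return r->server->keep_alive_timeout_set
               ? r->server                         /* -> cs->ka_sc */
               : r->connection->base_server;
    }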
--- server/mpm/event/event.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 4e544ccdec9..601a23dd9f6 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -251,6 +251,8 @@ struct event_conn_state_t { request_rec *r; /** server config this struct refers to */ event_srv_cfg *sc; + /** server config this struct refers to during keepalive */ + event_srv_cfg *ka_sc; /** scoreboard handle for the conn_rec */ ap_sb_handle_t *sbh; /** bucket allocator */ @@ -1224,18 +1226,23 @@ static int event_post_read_request(request_rec *r) event_conn_state_t *cs = ap_get_module_config(c->conn_config, &mpm_event_module); + /* Use Timeout from the request's server. */ + cs->sc = ap_get_module_config(r->server->module_config, + &mpm_event_module); + /* To preserve legacy behaviour (consistent with other MPMs), use - * the keepalive timeout from the base server (first on this IP:port) - * when none is explicitly configured on this server. + * KeepaliveTimeout from the base server (first on this IP:port) + * when none is explicitly configured on this server. Otherwise + * use the one from the request's server. */ - if (r->server->keep_alive_timeout_set) { - cs->sc = ap_get_module_config(r->server->module_config, - &mpm_event_module); + if (!r->server->keep_alive_timeout_set) { + cs->ka_sc = ap_get_module_config(c->base_server->module_config, + &mpm_event_module); } else { - cs->sc = ap_get_module_config(c->base_server->module_config, - &mpm_event_module); + cs->ka_sc = cs->sc; } + return OK; } @@ -1352,8 +1359,8 @@ static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd) APR_RING_ELEM_INIT(cs, timeout_list); - cs->sc = ap_get_module_config(ap_server_conf->module_config, - &mpm_event_module); + cs->sc = cs->ka_sc = ap_get_module_config(ap_server_conf->module_config, + &mpm_event_module); /** * XXX If the platform does not have a usable way of bundling @@ -1594,7 +1601,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, */ notify_suspend(cs); - if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->sc->ka_q)) { + if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q)) { apr_table_setn(cs->c->notes, "short-lingering-close", "1"); cs->pub.state = CONN_STATE_LINGER; goto lingering_close; From 589d21a0cacf822d905f3c37632be102b243921f Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 27 Jun 2023 03:26:56 +0200 Subject: [PATCH 07/22] mpm_event: Add kill_connection() to log (APLOG_INFO) interrupted connections. 
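kill_connection() follows the same caller-location pattern as
close_connection(): a *_at() helper plus a macro filling in
__FUNCTION__/__LINE__, so the APLOG_INFO message points at the call
site rather than at the helper. A standalone sketch of the pattern
(hypothetical names, fprintf() standing in for ap_log_cerror()):

    #include <stdio.h>

    static void kill_conn_at(int status, const char *at, int line)
    {
        /* reports which caller interrupted the connection and why */
        fprintf(stderr, "killing connection: status=%d at %s:%i\n",
                status, at, line);
    }
    #define kill_conn(status) kill_conn_at((status), __FUNCTION__, __LINE__)

    int main(void)
    {
        kill_conn(1); /* the status value is irrelevant to the pattern */
        return 0;
    }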
---
 server/mpm/event/event.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index 601a23dd9f6..b58fc50bd94 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -1154,6 +1154,25 @@ static void close_connection_at(event_conn_state_t *cs,
 #define close_connection(cs) \
     close_connection_at((cs), __FUNCTION__, __LINE__)
 
+static void kill_connection_at(event_conn_state_t *cs, apr_status_t status,
+                               const char *at, int line)
+{
+    if (cs->c) {
+        ap_log_cerror(APLOG_MARK, APLOG_INFO, status, cs->c, APLOGNO(10382)
+                      "killing connection in %s at %s:%i",
+                      cs_state_str(cs), at, line);
+    }
+    else {
+        ap_log_error(APLOG_MARK, APLOG_INFO, status, ap_server_conf, APLOGNO(10383)
+                     "killing unprocessed connection from %pI in %s at %s:%i",
+                     cs_raddr(cs), cs_state_str(cs), at, line);
+    }
+
+    close_connection_at(cs, at, line);
+}
+#define kill_connection(cs, status) \
+    kill_connection_at((cs), (status), __FUNCTION__, __LINE__)
+
 /* forward declare */
 static void set_conn_state_sense(event_conn_state_t *cs, int sense);
 
@@ -1787,7 +1806,7 @@ static apr_status_t push2worker(event_conn_state_t *cs, apr_socket_t *csd,
          * socket to a worker
          */
         if (cs) {
-            shutdown_connection(cs);
+            kill_connection(cs, rc);
         }
         else {
             if (csd) {
@@ -2178,7 +2197,7 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry,
             TO_QUEUE_REMOVE(qp, cs);
             if (!pollset_del(cs, 1)) {
-                shutdown_connection(cs);
+                kill_connection(cs, APR_EGENERAL);
                 continue;
             }
 
@@ -2468,7 +2487,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
             }
 
             if (!pollset_del(cs, 0)) {
-                shutdown_connection(cs);
+                /* Can't go anywhere, kill (and log) and next. */
+                kill_connection(cs, APR_EGENERAL);
                 continue;
             }

From 0ea6ae6162fbcdf4e659cacd16d07f93a61dbef7 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 1 Feb 2022 17:17:11 +0100
Subject: [PATCH 08/22] core,mod_reqtimeout: Add ap_get_connection_timeout().

Provide a new min_connection_timeout hook that modules enforcing a
dynamic connection timeout (e.g. mod_reqtimeout) should use to inform
ap_get_connection_timeout() users about the current timeout being
applied.

Expose the current timeout enforced by mod_reqtimeout by implementing
the min_connection_timeout hook.

* include/ap_mmn.h():
  Minor bump for min_connection_timeout and ap_get_connection_timeout().

* include/http_connection.h():
  Declare min_connection_timeout and ap_get_connection_timeout().

* server/connection.c():
  Implement min_connection_timeout and ap_get_connection_timeout().

* modules/filters/mod_reqtimeout.c(struct reqtimeout_stage_t):
  Add server_timeout as the timeout defined for the server at the
  current stage.

* modules/filters/mod_reqtimeout.c(struct reqtimeout_con_cfg):
  Add time_left as the dynamic timeout enforced by mod_reqtimeout at
  the current stage.

* modules/filters/mod_reqtimeout.c(check_time_left):
  Store the computed time_left in the reqtimeout_con_cfg, and set the
  socket timeout there (returning an error which will be caught if
  that fails).

* modules/filters/mod_reqtimeout.c(extend_timeout):
  Update time_left in the reqtimeout_con_cfg per the time taken by the
  last read.

* modules/filters/mod_reqtimeout.c(reqtimeout_filter):
  Remove the special path for APR_NONBLOCK_READ or AP_MODE_EATCRLF; it
  does the exact same thing as the !(AP_MODE_GETLINE && APR_BLOCK_READ)
  one.
* modules/filters/mod_reqtimeout.c(reqtimeout_init, reqtimeout_before_header, reqtimeout_before_body, INIT_STAGE): Set the server_timeout in the current stage. * modules/filters/mod_reqtimeout.c(reqtimeout_min_timeout): The new hook implementation. --- include/ap_mmn.h | 4 +- include/http_connection.h | 5 ++ modules/filters/mod_reqtimeout.c | 127 +++++++++++++++++++------------ server/connection.c | 16 ++++ 4 files changed, 101 insertions(+), 51 deletions(-) diff --git a/include/ap_mmn.h b/include/ap_mmn.h index acfa61e22b5..fb8f4512d47 100644 --- a/include/ap_mmn.h +++ b/include/ap_mmn.h @@ -733,6 +733,8 @@ * 20211221.25 (2.5.1-dev) AP_SLASHES and AP_IS_SLASH * 20211221.26 (2.5.1-dev) Add AGAIN, ap_check_input_pending() and * ap_check_output_pending() + * 20211221.27 (2.5.1-dev) Add min_connection_timeout hook and + * ap_get_connection_timeout() */ #define MODULE_MAGIC_COOKIE 0x41503235UL /* "AP25" */ @@ -740,7 +742,7 @@ #ifndef MODULE_MAGIC_NUMBER_MAJOR #define MODULE_MAGIC_NUMBER_MAJOR 20211221 #endif -#define MODULE_MAGIC_NUMBER_MINOR 26 /* 0...n */ +#define MODULE_MAGIC_NUMBER_MINOR 27 /* 0...n */ /** * Determine if the server's current MODULE_MAGIC_NUMBER is at least a diff --git a/include/http_connection.h b/include/http_connection.h index a89113bcb3b..601a4769109 100644 --- a/include/http_connection.h +++ b/include/http_connection.h @@ -196,6 +196,11 @@ AP_DECLARE(conn_rec *) ap_create_secondary_connection(apr_pool_t *pool, conn_rec *master, apr_bucket_alloc_t *alloc); +AP_DECLARE_HOOK(int, min_connection_timeout, + (conn_rec *c, server_rec *s, apr_interval_time_t *min_timeout)) + +AP_DECLARE(apr_interval_time_t) ap_get_connection_timeout(conn_rec *c, + server_rec *s); /** End Of Connection (EOC) bucket */ AP_DECLARE_DATA extern const apr_bucket_type_t ap_bucket_type_eoc; diff --git a/modules/filters/mod_reqtimeout.c b/modules/filters/mod_reqtimeout.c index 0e5afca57e4..693351e1280 100644 --- a/modules/filters/mod_reqtimeout.c +++ b/modules/filters/mod_reqtimeout.c @@ -45,6 +45,7 @@ typedef struct int max_timeout; /* max timeout in secs */ int min_rate; /* min rate in bytes/s */ apr_time_t rate_factor; /* scale factor (#usecs per min_rate) */ + apr_interval_time_t server_timeout; /* server timeout at this stage */ } reqtimeout_stage_t; typedef struct @@ -59,6 +60,7 @@ typedef struct { apr_time_t timeout_at; apr_time_t max_timeout_at; + apr_interval_time_t time_left; reqtimeout_stage_t cur_stage; int in_keep_alive; char *type; @@ -74,34 +76,45 @@ static int default_body_rate_factor; static void extend_timeout(reqtimeout_con_cfg *ccfg, apr_bucket_brigade *bb) { apr_off_t len; + apr_time_t old_timeout_at; apr_time_t new_timeout_at; if (apr_brigade_length(bb, 0, &len) != APR_SUCCESS || len <= 0) return; - new_timeout_at = ccfg->timeout_at + len * ccfg->cur_stage.rate_factor; + old_timeout_at = ccfg->timeout_at; + new_timeout_at = old_timeout_at + len * ccfg->cur_stage.rate_factor; if (ccfg->max_timeout_at > 0 && new_timeout_at > ccfg->max_timeout_at) { ccfg->timeout_at = ccfg->max_timeout_at; } else { ccfg->timeout_at = new_timeout_at; } + + ccfg->time_left += new_timeout_at - old_timeout_at; + if (ccfg->time_left > ccfg->cur_stage.server_timeout) { + ccfg->time_left = ccfg->cur_stage.server_timeout; + } } -static apr_status_t check_time_left(reqtimeout_con_cfg *ccfg, - apr_time_t *time_left_p, - apr_time_t now) +static apr_status_t check_and_update_time_left(reqtimeout_con_cfg *ccfg, + apr_time_t now) { if (!now) now = apr_time_now(); - *time_left_p = ccfg->timeout_at - now; - if 
(*time_left_p <= 0) + + ccfg->time_left = ccfg->timeout_at - now; + if (ccfg->time_left <= 0) return APR_TIMEUP; - if (*time_left_p < apr_time_from_sec(1)) { - *time_left_p = apr_time_from_sec(1); + if (ccfg->time_left < apr_time_from_sec(1)) { + ccfg->time_left = apr_time_from_sec(1); } - return APR_SUCCESS; + else if (ccfg->time_left > ccfg->cur_stage.server_timeout) { + ccfg->time_left = ccfg->cur_stage.server_timeout; + } + + return apr_socket_timeout_set(ccfg->socket, ccfg->time_left); } static apr_status_t have_lf_or_eos(apr_bucket_brigade *bb) @@ -168,16 +181,14 @@ static apr_status_t brigade_append(apr_bucket_brigade *bbOut, apr_bucket_brigade } -#define MIN(x,y) ((x) < (y) ? (x) : (y)) static apr_status_t reqtimeout_filter(ap_filter_t *f, apr_bucket_brigade *bb, ap_input_mode_t mode, apr_read_type_e block, apr_off_t readbytes) { - apr_time_t time_left; - apr_time_t now = 0; apr_status_t rv; + apr_time_t now = 0; apr_interval_time_t saved_sock_timeout = UNSET; reqtimeout_con_cfg *ccfg = f->ctx; @@ -198,11 +209,11 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, /* set new timeout */ now = apr_time_now(); ccfg->timeout_at = now + apr_time_from_sec(ccfg->cur_stage.timeout); - ccfg->cur_stage.timeout = 0; if (ccfg->cur_stage.max_timeout > 0) { ccfg->max_timeout_at = now + apr_time_from_sec(ccfg->cur_stage.max_timeout); ccfg->cur_stage.max_timeout = 0; } + ccfg->cur_stage.timeout = 0; } else if (ccfg->timeout_at == 0) { /* no timeout set, or in between requests */ @@ -213,39 +224,30 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, ccfg->socket = ap_get_conn_socket(f->c); } - rv = check_time_left(ccfg, &time_left, now); - if (rv != APR_SUCCESS) - goto out; - - if (block == APR_NONBLOCK_READ || mode == AP_MODE_EATCRLF) { - rv = ap_get_brigade(f->next, bb, mode, block, readbytes); - if (ccfg->cur_stage.rate_factor && rv == APR_SUCCESS) { - extend_timeout(ccfg, bb); - } - return rv; - } - rv = apr_socket_timeout_get(ccfg->socket, &saved_sock_timeout); AP_DEBUG_ASSERT(rv == APR_SUCCESS); - rv = apr_socket_timeout_set(ccfg->socket, MIN(time_left, saved_sock_timeout)); - AP_DEBUG_ASSERT(rv == APR_SUCCESS); + rv = check_and_update_time_left(ccfg, now); + if (rv != APR_SUCCESS) + goto cleanup; + + if (mode == AP_MODE_GETLINE && block == APR_BLOCK_READ) { + apr_off_t remaining = HUGE_STRING_LEN; +#if APR_MAJOR_VERSION < 2 + apr_int32_t nsds; + apr_interval_time_t poll_timeout; + apr_pollfd_t pollset; + pollset.p = NULL; +#endif - if (mode == AP_MODE_GETLINE) { /* * For a blocking AP_MODE_GETLINE read, apr_brigade_split_line() * would loop until a whole line has been read. As this would make it * impossible to enforce a total timeout, we only do non-blocking * reads. */ - apr_off_t remaining = HUGE_STRING_LEN; do { apr_off_t bblen; -#if APR_MAJOR_VERSION < 2 - apr_int32_t nsds; - apr_interval_time_t poll_timeout; - apr_pollfd_t pollset; -#endif rv = ap_get_brigade(f->next, bb, AP_MODE_GETLINE, APR_NONBLOCK_READ, remaining); if (rv != APR_SUCCESS && !APR_STATUS_IS_EAGAIN(rv)) { @@ -282,10 +284,12 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, /* ... 
and wait for more */ #if APR_MAJOR_VERSION < 2 - pollset.p = f->c->pool; - pollset.desc_type = APR_POLL_SOCKET; - pollset.reqevents = APR_POLLIN|APR_POLLHUP; - pollset.desc.s = ccfg->socket; + if (pollset.p == NULL) { + pollset.p = f->c->pool; + pollset.desc_type = APR_POLL_SOCKET; + pollset.reqevents = APR_POLLIN | APR_POLLHUP | APR_POLLERR; + pollset.desc.s = ccfg->socket; + } apr_socket_timeout_get(ccfg->socket, &poll_timeout); rv = apr_poll(&pollset, 1, &nsds, poll_timeout); #else @@ -294,14 +298,10 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, if (rv != APR_SUCCESS) break; - rv = check_time_left(ccfg, &time_left, 0); + rv = check_and_update_time_left(ccfg, 0); if (rv != APR_SUCCESS) break; - rv = apr_socket_timeout_set(ccfg->socket, - MIN(time_left, saved_sock_timeout)); - AP_DEBUG_ASSERT(rv == APR_SUCCESS); - } while (1); if (ccfg->tmpbb) @@ -310,19 +310,21 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, } else { /* mode != AP_MODE_GETLINE */ rv = ap_get_brigade(f->next, bb, mode, block, readbytes); + /* Don't extend the timeout in speculative mode, wait for * the real (relevant) bytes to be asked later, within the * currently allotted time. */ - if (ccfg->cur_stage.rate_factor && rv == APR_SUCCESS - && mode != AP_MODE_SPECULATIVE) { + if (rv == APR_SUCCESS + && mode != AP_MODE_SPECULATIVE + && ccfg->cur_stage.rate_factor) { extend_timeout(ccfg, bb); } } +cleanup: apr_socket_timeout_set(ccfg->socket, saved_sock_timeout); -out: if (APR_STATUS_IS_TIMEUP(rv)) { ap_log_cerror(APLOG_MARK, APLOG_INFO, 0, f->c, APLOGNO(01382) "Request %s read timeout", ccfg->type); @@ -353,7 +355,7 @@ static apr_status_t reqtimeout_eor(ap_filter_t *f, apr_bucket_brigade *bb) return ap_pass_brigade(f->next, bb); } -#define INIT_STAGE(cfg, ccfg, stage) do { \ +#define INIT_STAGE(cfg, ccfg, stage, s_timeout) do { \ if (cfg->stage.timeout != UNSET) { \ ccfg->cur_stage.timeout = cfg->stage.timeout; \ ccfg->cur_stage.max_timeout = cfg->stage.max_timeout; \ @@ -364,6 +366,8 @@ static apr_status_t reqtimeout_eor(ap_filter_t *f, apr_bucket_brigade *bb) ccfg->cur_stage.max_timeout = MRT_DEFAULT_##stage##_MAX_TIMEOUT; \ ccfg->cur_stage.rate_factor = default_##stage##_rate_factor; \ } \ + ccfg->cur_stage.server_timeout = s_timeout; \ + ccfg->time_left = ccfg->cur_stage.timeout; \ } while (0) static int reqtimeout_init(conn_rec *c) @@ -392,7 +396,7 @@ static int reqtimeout_init(conn_rec *c) ccfg->type = "handshake"; if (cfg->handshake.timeout > 0) { - INIT_STAGE(cfg, ccfg, handshake); + INIT_STAGE(cfg, ccfg, handshake, c->base_server->timeout); } } @@ -422,7 +426,7 @@ static void reqtimeout_before_header(request_rec *r, conn_rec *c) ccfg->timeout_at = 0; ccfg->max_timeout_at = 0; ccfg->in_keep_alive = (c->keepalives > 0); - INIT_STAGE(cfg, ccfg, header); + INIT_STAGE(cfg, ccfg, header, c->base_server->timeout); } static int reqtimeout_before_body(request_rec *r) @@ -447,11 +451,31 @@ static int reqtimeout_before_body(request_rec *r) ccfg->cur_stage.timeout = 0; } else { - INIT_STAGE(cfg, ccfg, body); + INIT_STAGE(cfg, ccfg, body, r->server->timeout); } return OK; } +static int reqtimeout_min_timeout(conn_rec *c, server_rec *s/*unused*/, + apr_interval_time_t *min_timeout) +{ + reqtimeout_con_cfg *ccfg = ap_get_module_config(c->conn_config, + &reqtimeout_module); + reqtimeout_stage_t *stage = &ccfg->cur_stage; + + if (stage->timeout > 0 || ccfg->timeout_at) { + if (ccfg->time_left <= 0) { + *min_timeout = 0; + } + else if (*min_timeout < 0 || *min_timeout > ccfg->time_left) { + *min_timeout = 
ccfg->time_left;
+        }
+        return OK;
+    }
+
+    return DECLINED;
+}
+
 #define UNSET_STAGE(cfg, stage) do { \
     cfg->stage.timeout = UNSET; \
     cfg->stage.max_timeout = UNSET; \
@@ -637,6 +661,9 @@ static void reqtimeout_hooks(apr_pool_t *pool)
     ap_hook_post_read_request(reqtimeout_before_body, NULL, NULL,
                               APR_HOOK_MIDDLE);
 
+    ap_hook_min_connection_timeout(reqtimeout_min_timeout, NULL, NULL,
+                                   APR_HOOK_MIDDLE);
+
 #if MRT_DEFAULT_handshake_MIN_RATE
     default_handshake_rate_factor = apr_time_from_sec(1)
                                     / MRT_DEFAULT_handshake_MIN_RATE;
diff --git a/server/connection.c b/server/connection.c
index f32a1f3712c..a1c4c1860f0 100644
--- a/server/connection.c
+++ b/server/connection.c
@@ -36,6 +36,7 @@ APR_HOOK_STRUCT(
     APR_HOOK_LINK(pre_connection)
     APR_HOOK_LINK(pre_close_connection)
     APR_HOOK_LINK(create_secondary_connection)
+    APR_HOOK_LINK(min_connection_timeout)
 )
 AP_IMPLEMENT_HOOK_RUN_FIRST(conn_rec *,create_connection,
                             (apr_pool_t *p, server_rec *server, apr_socket_t *csd, long conn_id, void *sbh, apr_bucket_alloc_t *alloc),
@@ -46,6 +47,9 @@ AP_IMPLEMENT_HOOK_RUN_ALL(int,pre_close_connection,(conn_rec *c),(c),OK,DECLINED
 AP_IMPLEMENT_HOOK_RUN_FIRST(conn_rec *,create_secondary_connection,
                             (apr_pool_t *p, conn_rec *master, apr_bucket_alloc_t *alloc),
                             (p, master, alloc), NULL)
+AP_IMPLEMENT_HOOK_RUN_ALL(int,min_connection_timeout,
+                          (conn_rec *c, server_rec *s, apr_interval_time_t *min_timeout),
+                          (c, s, min_timeout),OK,DECLINED)
 
 AP_DECLARE(conn_rec *) ap_create_connection(apr_pool_t *p,
                                             server_rec *server,
@@ -251,3 +255,15 @@ AP_CORE_DECLARE(void) ap_process_connection(conn_rec *c, void *csd)
         ap_run_process_connection(c);
     }
 }
+
+AP_DECLARE(apr_interval_time_t) ap_get_connection_timeout(conn_rec *c,
+                                                          server_rec *s)
+{
+    apr_interval_time_t timeout = -1;
+
+    if (ap_run_min_connection_timeout(c, s, &timeout) != OK || timeout < 0) {
+        timeout = (s) ? s->timeout : c->base_server->timeout;
+    }
+
+    return timeout;
+}

From 8bddc079c906fb556f6507026d32a6c3b1dcaae7 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 1 Feb 2022 17:25:48 +0100
Subject: [PATCH 09/22] mpm_event: Use ap_get_connection_timeout() for
 CONN_STATE_ASYNC_WAITIO.

If ap_run_process_connection() returns CONN_STATE_ASYNC_WAITIO and the
connection timeout as returned by ap_get_connection_timeout() differs
from the waitio_q timeout, use a timer event rather than the waitio_q
to keep track of the idle connection.

* server/mpm_fdqueue.h(struct timer_event_t):
  Add the "timeout" field to store the timeout of the timer; recomputing
  it from "when" would otherwise require calling apr_time_now().

* server/mpm/event/event.c():
  #define TIMERS_FUDGE_TIMEOUT as the minimal timer event timeout, to
  prevent the events from firing before the sockets are added to the
  pollset. Currently set to 50ms (an arbitrary value).

* server/mpm/event/event.c(struct event_conn_state_t):
  Add the timer_event_t *te field as an alternative to the q.

* server/mpm/event/event.c(struct event_srv_cfg_s):
  Add the server_rec *s field to backref the server_rec and easily pass
  cs->sc->s to ap_get_connection_timeout().

* server/mpm/event/event.c(pollset_add_at, pollset_del_at):
  If the connection is attached to a timer event, log a "t" instead of
  a "q" and the timer's timeout instead of the q's.

* server/mpm/event/event.c(process_socket):
  If ap_get_connection_timeout() differs from the waitio_q timeout,
  acquire a timer event and associate it with the conn_state. A timer
  event associated with a conn_state has a NULL callback (cbfn); see
  the sketch below.
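A standalone sketch of that NULL-callback convention (hypothetical
minimal types, illustration only):

    #include <stddef.h>
    #include <stdio.h>

    typedef struct timer_ev {
        void (*cbfunc)(void *baton);  /* NULL: baton is a connection */
        void *baton;
    } timer_ev;

    static void fire(timer_ev *te)
    {
        if (te->cbfunc == NULL) {
            /* conn_state timer: time out the idle connection */
            fprintf(stderr, "timed out connection %p\n", te->baton);
        }
        else {
            /* user timer: run (or push to a worker) the callback */
            te->cbfunc(te->baton);
        }
    }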
* server/mpm/event/event.c(get_timer_event):
  Set the given timeout to the ->timeout field.

* server/mpm/event/event.c(event_register_timed_callback_ex,
  event_register_poll_callback_ex):
  Return APR_EINVAL if the given callbacks are NULL; this is reserved
  for conn_state timers now. Since it would have crashed at some point
  to pass NULL callbacks before, it's not really an API change.

* server/mpm/event/event.c(listener_thread):
  Fix the poll() timeout set from timers_next_expiry, which should be
  taken into account whether it has expired or not.
  When a conn_state timer fires/expires, remove it from the pollset and
  abort the connection (with APLOG_INFO).
  When a conn_state timer is polled, cancel the timer.
---
 server/mpm/event/event.c | 82 ++++++++++++++++++++++++++++----------
 server/mpm_fdqueue.h     |  1 +
 2 files changed, 64 insertions(+), 19 deletions(-)

diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index b58fc50bd94..8ea061140c3 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -267,6 +267,8 @@ struct event_conn_state_t {
     apr_time_t queue_timestamp;
     /** the timeout queue for this entry */
     struct timeout_queue *q;
+    /** the timer event for this entry */
+    timer_event_t *te;
 
     /*
      * when queued to workers
@@ -646,6 +648,7 @@ struct event_srv_cfg_s {
     struct timeout_queue *io_q,
                          *wc_q,
                          *ka_q;
+    server_rec *s; /* backref */
 };
 
 #define ID_FROM_CHILD_THREAD(c, t)  ((c * thread_limit) + t)
@@ -1266,7 +1269,7 @@ static int event_post_read_request(request_rec *r)
 }
 
 static int pollset_add_at(event_conn_state_t *cs, int sense,
-                          struct timeout_queue *q,
+                          struct timeout_queue *q, timer_event_t *te,
                           const char *at, int line)
 {
     apr_status_t rv;
@@ -1275,11 +1278,11 @@ static int pollset_add_at(event_conn_state_t *cs, int sense,
                   "pollset: add %s=%" APR_TIME_T_FMT " events=%x"
                   " for connection %" CS_FMT " at %s:%i",
                   (q) ? "q" : "t",
-                  (q) ? q->timeout : -1,
+                  (q) ? q->timeout : (te) ? te->timeout : -1,
                   (int)cs->pfd.reqevents,
                   CS_ARG(cs), at, line);
 
-    ap_assert(cs->q == NULL && q != NULL);
+    ap_assert(cs->q == NULL && cs->te == NULL && ((q != NULL) ^ (te != NULL)));
 
     set_conn_state_sense(cs, sense);
 
@@ -1287,12 +1290,20 @@ static int pollset_add_at(event_conn_state_t *cs, int sense,
         apr_thread_mutex_lock(timeout_mutex);
         TO_QUEUE_APPEND(q, cs);
     }
+    else {
+        cs->te = te;
+    }
+
     rv = apr_pollset_add(event_pollset, &cs->pfd);
     if (rv != APR_SUCCESS) {
         if (q) {
             TO_QUEUE_REMOVE(q, cs);
             apr_thread_mutex_unlock(timeout_mutex);
         }
+        else {
+            te->canceled = 1;
+            cs->te = NULL;
+        }
 
         /* close_worker_sockets() may have closed it already */
         if (workers_may_exit) {
@@ -1312,8 +1323,8 @@ static int pollset_add_at(event_conn_state_t *cs, int sense,
     }
     return 1;
 }
-#define pollset_add(cs, sense, q) \
-    pollset_add_at((cs), (sense), (q), __FUNCTION__, __LINE__)
+#define pollset_add(cs, sense, q, te) \
+    pollset_add_at((cs), (sense), (q), (te), __FUNCTION__, __LINE__)
 
 static int pollset_del_at(event_conn_state_t *cs, int locked,
                           const char *at, int line)
@@ -1324,11 +1335,11 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
                   "pollset: del %s=%" APR_TIME_T_FMT " events=%x"
                   " for connection %" CS_FMT " at %s:%i",
                   (cs->q) ? "q" : "t",
-                  (cs->q) ? cs->q->timeout : -1,
+                  (cs->q) ? cs->q->timeout : (cs->te ? cs->te->timeout : -1),
                  (int)cs->pfd.reqevents,
                  CS_ARG(cs), at, line);
 
-    ap_assert(cs->q != NULL);
+    ap_assert((cs->q != NULL) ^ (cs->te != NULL));
 
     if (cs->q) {
         if (!locked) {
@@ -1339,6 +1350,10 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
             apr_thread_mutex_unlock(timeout_mutex);
         }
     }
+    else {
+        cs->te->canceled = 1;
+        cs->te = NULL;
+    }
 
     /*
      * Some of the pollset backends, like KQueue or Epoll
@@ -1362,6 +1377,10 @@
     pollset_del_at((cs), (locked), __FUNCTION__, __LINE__)
 
 /* Forward declare */
+static timer_event_t *get_timer_event(apr_time_t timeout,
+                                      ap_mpm_callback_fn_t *cbfn, void *baton,
+                                      int insert,
+                                      apr_array_header_t *pfds);
 static void process_lingering_close(event_conn_state_t *cs);
 
 static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd)
@@ -1547,16 +1566,32 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
     }
 
     if (cs->pub.state == CONN_STATE_ASYNC_WAITIO) {
+        apr_interval_time_t timeout;
+        struct timeout_queue *q = NULL;
+        timer_event_t *te = NULL;
+
         /* Set a read/write timeout for this connection, and let the
         * event thread poll for read/writeability.
         */
        ap_update_child_status(cs->sbh, SERVER_BUSY_READ, NULL);
         notify_suspend(cs);
 
-        /* Modules might set c->cs->sense to CONN_SENSE_WANT_WRITE,
-         * the default is CONN_SENSE_WANT_READ still.
+        /* If the connection timeout is actually different than the waitio_q's,
+         * use a timer event to honor it (e.g. mod_reqtimeout may enforce its
+         * own timeouts per request stage).
          */
-        if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->sc->io_q)) {
+        timeout = ap_get_connection_timeout(c, cs->sc->s);
+        if (timeout >= 0 && timeout != cs->sc->io_q->timeout) {
+            /* Prevent the timer from firing before the pollset is updated */
+            if (timeout < TIMERS_FUDGE_TIMEOUT) {
+                timeout = TIMERS_FUDGE_TIMEOUT;
+            }
+            te = get_timer_event(timeout, NULL, cs, 1, NULL);
+        }
+        else {
+            q = cs->sc->io_q;
+        }
+        if (!pollset_add(cs, CONN_SENSE_WANT_READ, q, te)) {
             apr_table_setn(cs->c->notes, "short-lingering-close", "1");
             cs->pub.state = CONN_STATE_LINGER;
             goto lingering_close;
@@ -1583,7 +1618,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
         /* Let the event thread poll for write */
         notify_suspend(cs);
         cs->pub.sense = CONN_SENSE_DEFAULT;
-        if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q)) {
+        if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) {
             return; /* queued */
         }
         /* Fall through lingering close */
@@ -1620,7 +1655,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
          */
         notify_suspend(cs);
 
-        if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q)) {
+        if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q, NULL)) {
             apr_table_setn(cs->c->notes, "short-lingering-close", "1");
             cs->pub.state = CONN_STATE_LINGER;
             goto lingering_close;
@@ -1661,7 +1696,7 @@ static apr_status_t event_resume_suspended (conn_rec *c)
     cs->pub.sense = CONN_SENSE_DEFAULT;
     if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) {
         cs->pub.state = CONN_STATE_WRITE_COMPLETION;
-        if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q)) {
+        if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) {
             return APR_SUCCESS; /* queued */
         }
 
@@ -1884,8 +1919,7 @@ static int timer_comp(void *a, void *b)
 static apr_thread_mutex_t *g_timer_skiplist_mtx;
 
 static timer_event_t *get_timer_event(apr_time_t timeout,
-                                      ap_mpm_callback_fn_t *cbfn,
-                                      void *baton,
+                                      ap_mpm_callback_fn_t *cbfn, void *baton,
                                       int insert,
                                       apr_array_header_t *pfds)
 {
@@
-1909,6 +1943,7 @@ static timer_event_t *get_timer_event(apr_time_t timeout, te->cbfunc = cbfn; te->baton = baton; te->when = now + timeout; + te->timeout = timeout; te->pfds = pfds; if (insert) { @@ -2141,7 +2176,7 @@ static void process_lingering_close(event_conn_state_t *cs) struct timeout_queue *q; /* (Re)queue the connection to come back when readable */ q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; - if (pollset_add(cs, CONN_SENSE_WANT_READ, q)) { + if (pollset_add(cs, CONN_SENSE_WANT_READ, q, NULL)) { return; /* queued */ } } @@ -2195,7 +2230,6 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, break; } - TO_QUEUE_REMOVE(qp, cs); if (!pollset_del(cs, 1)) { kill_connection(cs, APR_EGENERAL); continue; @@ -2353,8 +2387,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) apr_thread_mutex_lock(g_timer_skiplist_mtx); while ((te = apr_skiplist_peek(timer_skiplist))) { if (te->when > now) { - timers_next_expiry = te->when; - timeout = te->when - now; break; } apr_skiplist_pop(timer_skiplist, NULL); @@ -2364,6 +2396,17 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) continue; } + if (!te->cbfunc) { + cs = te->baton; + put_timer_event(te, 1); + ap_assert(cs && cs->te == te); + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "timed out connection %" CS_FMT, CS_ARG(cs)); + (void)pollset_del(cs, 0); + kill_connection(cs, APR_TIMEUP); + continue; + } + if (te->pfds) { /* remove all sockets from the pollset */ apr_pool_cleanup_run(te->pfds->pool, te->pfds, @@ -4417,6 +4460,7 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, for (; s; s = s->next) { event_srv_cfg *sc = apr_pcalloc(pconf, sizeof *sc); ap_set_module_config(s->module_config, &mpm_event_module, sc); + sc->s = s; /* backref */ sc->io_q = TO_QUEUE_CHAIN(pconf, "waitio", s->timeout, &waitio_q, io_h, ptemp); diff --git a/server/mpm_fdqueue.h b/server/mpm_fdqueue.h index 0dd558b938a..260e22ab80e 100644 --- a/server/mpm_fdqueue.h +++ b/server/mpm_fdqueue.h @@ -70,6 +70,7 @@ struct timer_event_t void *baton; int canceled; apr_array_header_t *pfds; + apr_interval_time_t timeout; }; typedef struct timer_event_t timer_event_t; From ae9a3b90f96939e38f2f92a73e34180b83e41c8a Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 3 Jun 2024 16:42:51 +0200 Subject: [PATCH 10/22] mpm_fdqueue: Allow to queue any events (socket, timer, opaque), and use that for mpm_event's backlog queue. 
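Queuing a connection then boils down to filling the embedded event and
pushing it (usage sketch assembled from the hunks below; a fragment, not
compilable on its own, with error handling elided):

    /* The event is embedded in event_conn_state_t; its callback runs
     * under the queue lock on push and on pop, which keeps the
     * backlog_q timeout queue consistent with worker_queue itself.
     */
    ap_queue_event_t *qe = cs_qe(cs);      /* &cs->bse.qe */
    qe->type = AP_QUEUE_EVENT_SOCK;
    qe->data.se = cs_se(cs);               /* &cs->bse.se */
    qe->cb = conn_state_backlog_cb;        /* marks cs as "in backlog" */
    qe->cb_baton = cs;
    rv = ap_queue_push_event(worker_queue, qe);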
--- include/scoreboard.h | 1 + modules/lua/lua_request.c | 4 + server/mpm/event/event.c | 891 +++++++++++++++++++------------------ server/mpm/worker/worker.c | 9 +- server/mpm_fdqueue.c | 580 +++++++++++++++--------- server/mpm_fdqueue.h | 92 ++-- 6 files changed, 884 insertions(+), 693 deletions(-) diff --git a/include/scoreboard.h b/include/scoreboard.h index 25d19f03538..e83e52fdb16 100644 --- a/include/scoreboard.h +++ b/include/scoreboard.h @@ -149,6 +149,7 @@ struct process_score { apr_uint32_t keep_alive; /* async connections in keep alive */ apr_uint32_t suspended; /* connections suspended by some module */ apr_uint32_t wait_io; /* async connections waiting an IO in the MPM */ + apr_uint32_t backlog; /* async connections waiting for a worker */ }; /* Scoreboard is now in 'local' memory, since it isn't updated once created, diff --git a/modules/lua/lua_request.c b/modules/lua/lua_request.c index 6787bbfaf7f..5fa3a968c6b 100644 --- a/modules/lua/lua_request.c +++ b/modules/lua/lua_request.c @@ -1248,6 +1248,10 @@ static int lua_ap_scoreboard_process(lua_State *L) lua_pushnumber(L, ps_record->connections); lua_settable(L, -3); + lua_pushstring(L, "backlog"); + lua_pushnumber(L, ps_record->backlog); + lua_settable(L, -3); + lua_pushstring(L, "keepalive"); lua_pushnumber(L, ps_record->keep_alive); lua_settable(L, -3); diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 8ea061140c3..5a9f4b676b4 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -203,6 +203,7 @@ static volatile int workers_may_exit = 0; static volatile int start_thread_may_exit = 0; static volatile int listener_may_exit = 0; static apr_uint32_t connection_count = 0; /* Number of open connections */ +static apr_uint32_t timers_count = 0; /* Number of queued timers */ static apr_uint32_t lingering_count = 0; /* Number of connections in lingering close */ static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown @@ -236,6 +237,14 @@ static apr_thread_mutex_t *timeout_mutex; * XXX: cases. */ static apr_pollset_t *event_pollset; +#define POLLSET_RESERVE_SIZE 10000 + +struct backlog_timer_event { + timer_event_t te; + ap_queue_event_t qe; +}; +#define te_qe(te) (&((struct backlog_timer_event *)(te))->qe) +#define te_in_backlog(te) (te_qe(te)->cb != NULL) typedef struct event_conn_state_t event_conn_state_t; struct event_conn_state_t { @@ -273,8 +282,12 @@ struct event_conn_state_t { /* * when queued to workers */ - /** chaining in defer_linger_chain */ - struct event_conn_state_t *chain; + /** the backlog event for this entry */ + struct backlog_socket_event { + sock_event_t se; + ap_queue_event_t qe; + struct timeout_queue *q; + } bse; /* * bools as bits @@ -290,6 +303,9 @@ struct event_conn_state_t { /** Has ap_start_lingering_close() been called? */ linger_started :1; }; +#define cs_se(cs) (&(cs)->bse.se) +#define cs_qe(cs) (&(cs)->bse.qe) +#define cs_in_backlog(cs) (cs_qe(cs)->cb != NULL) static APR_INLINE apr_socket_t *cs_sd(event_conn_state_t *cs) { @@ -336,12 +352,6 @@ static APR_INLINE const char *cs_state_str(event_conn_state_t *cs) #define CS_FMT_TO CS_FMT " to [%pI]" #define CS_ARG_TO(cs) CS_ARG(cs), cs_raddr(cs) -/* - * The chain of connections to be shutdown by a worker thread (deferred), - * linked list updated atomically. 
- */ -static event_conn_state_t *volatile defer_linger_chain; - #define USE_CLOCK_COARSE 0 /* not for now */ #if HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC) /* POSIX */ static clockid_t event_clockid; @@ -447,14 +457,15 @@ struct timeout_queue { * keepalive_q uses vhost's KeepAliveTimeOut * linger_q uses MAX_SECS_TO_LINGER * short_linger_q uses SECONDS_TO_LINGER + * backlog_q uses vhost's TimeOut */ static struct timeout_queue *waitio_q, /* wait for I/O to happen */ *write_completion_q, /* completion or user async poll */ *keepalive_q, /* in between requests */ *linger_q, /* lingering (read) before close */ - *short_linger_q; /* lingering (read) before close (short timeout) */ - -static volatile apr_time_t queues_next_expiry; /* next expiry time accross all queues */ + *short_linger_q, /* lingering (read) before close (short timeout) */ + *backlog_q; /* waiting for a worker */ +static volatile apr_time_t queues_next_expiry; /* next expiry time accross all queues */ /* * Macros for accessing struct timeout_queue. @@ -584,7 +595,6 @@ typedef struct socket_callback_baton apr_array_header_t *pfds; timer_event_t *cancel_event; /* If a timeout was requested, a pointer to the timer event */ struct socket_callback_baton *next; - unsigned int signaled :1; } socket_callback_baton_t; typedef struct event_child_bucket { @@ -647,7 +657,8 @@ struct event_srv_cfg_s { /* Per server timeout queues */ struct timeout_queue *io_q, *wc_q, - *ka_q; + *ka_q, + *bl_q; server_rec *s; /* backref */ }; @@ -696,25 +707,34 @@ static int ap_child_slot; /* Current child process slot in scoreboard */ */ static apr_socket_t **worker_sockets; -static volatile apr_uint32_t listensocks_disabled; +/* Disabling / enabling listening sockets can only happen in the listener + * thread, which is the only one to set 'dying' to 1 too, so it's all thread + * safe. 'listensocks_off' is changed atomically still because it's read + * concurrently in listensocks_disabled(). 
+ */ +static /*atomic*/ apr_uint32_t listensocks_off = 0; -static void disable_listensocks(void) +static int disable_listensocks(void) { int i; - if (apr_atomic_cas32(&listensocks_disabled, 1, 0) != 0) { - return; + + if (apr_atomic_cas32(&listensocks_off, 1, 0) != 0) { + return 0; } ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) - "Suspend listening sockets: idlers:%i conns:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u suspended:%u", - ap_queue_info_num_idlers(worker_queue_info), + "Suspend listening sockets: idlers:%i conns:%u backlog:%u " + "waitio:%u write:%u keepalive:%u linger:%u/%u " + "timers:%u suspended:%u", + ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), + apr_atomic_read32(backlog_q->total), apr_atomic_read32(waitio_q->total), apr_atomic_read32(write_completion_q->total), apr_atomic_read32(keepalive_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), + apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); ap_scoreboard_image->parent[ap_child_slot].not_accepting = 1; @@ -722,26 +742,31 @@ static void disable_listensocks(void) for (i = 0; i < num_listensocks; i++) { apr_pollset_remove(event_pollset, &listener_pollfd[i]); } + return 1; } -static void enable_listensocks(void) +static int enable_listensocks(void) { int i; + if (listener_may_exit - || apr_atomic_cas32(&listensocks_disabled, 0, 1) != 1) { - return; + || apr_atomic_cas32(&listensocks_off, 0, 1) != 1) { + return 0; } ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) - "Resume listening sockets: idlers:%i conns:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u suspended:%u", - ap_queue_info_num_idlers(worker_queue_info), + "Resume listening sockets: idlers:%i conns:%u backlog:%u " + "waitio:%u write:%u keepalive:%u linger:%u/%u " + "timers:%u suspended:%u", + ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), + apr_atomic_read32(backlog_q->total), apr_atomic_read32(waitio_q->total), apr_atomic_read32(write_completion_q->total), apr_atomic_read32(keepalive_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), + apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); /* @@ -753,23 +778,24 @@ static void enable_listensocks(void) for (i = 0; i < num_listensocks; i++) { apr_pollset_add(event_pollset, &listener_pollfd[i]); } + return 1; } -static APR_INLINE apr_uint32_t listeners_disabled(void) +static APR_INLINE int listensocks_disabled(void) { - return apr_atomic_read32(&listensocks_disabled); + return apr_atomic_read32(&listensocks_off) != 0; } static APR_INLINE int connections_above_limit(int *busy) { - apr_uint32_t i_count = ap_queue_info_num_idlers(worker_queue_info); + apr_int32_t i_count = ap_queue_info_idlers_count(worker_queue_info); if (i_count > 0) { apr_uint32_t c_count = apr_atomic_read32(&connection_count); apr_uint32_t l_count = apr_atomic_read32(&lingering_count); if (c_count <= l_count - /* Off by 'listeners_disabled()' to avoid flip flop */ + /* Off by 'listensocks_disabled()' to avoid flip flop */ || c_count - l_count < (apr_uint32_t)threads_per_child + - (i_count - listeners_disabled()) * + (i_count - listensocks_disabled()) * (worker_factor / WORKER_FACTOR_SCALE)) { return 0; } @@ -782,7 +808,7 @@ static APR_INLINE int connections_above_limit(int *busy) static APR_INLINE int should_enable_listensocks(void) { - return !dying && listeners_disabled() && 
!connections_above_limit(NULL); + return !dying && listensocks_disabled() && !connections_above_limit(NULL); } static void close_socket_at(apr_socket_t *csd, @@ -1101,36 +1127,6 @@ static void notify_resume(event_conn_state_t *cs, int cleanup) ap_run_resume_connection(cs->c, cs->r); } -/* - * Defer flush and close of the connection by adding it to defer_linger_chain, - * for a worker to grab it and do the job (should that be blocking). - * Pre-condition: nonblocking, can be called from anywhere provided cs is not - * in any timeout queue or in the pollset. - */ -static int defer_lingering_close(event_conn_state_t *cs) -{ - ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "deferring close for connection %" CS_FMT, CS_ARG(cs)); - - /* The connection is not shutdown() yet strictly speaking, but it's not - * in any queue nor handled by a worker either (will be very soon), so - * to account for it somewhere we bump lingering_count now (and set - * deferred_linger for process_lingering_close() to know). - */ - cs->pub.state = CONN_STATE_LINGER; - apr_atomic_inc32(&lingering_count); - cs->deferred_linger = 1; - for (;;) { - event_conn_state_t *chain = cs->chain = defer_linger_chain; - if (apr_atomic_casptr((void *)&defer_linger_chain, cs, - chain) != chain) { - /* Race lost, try again */ - continue; - } - return 1; - } -} - /* Close the connection and release its resources (ptrans), either because an * unrecoverable error occured (queues or pollset add/remove) or more usually * if lingering close timed out. @@ -1178,23 +1174,53 @@ static void kill_connection_at(event_conn_state_t *cs, apr_status_t status, /* forward declare */ static void set_conn_state_sense(event_conn_state_t *cs, int sense); +static void push2worker(event_conn_state_t *cs, timer_event_t *te, + apr_time_t now, int *busy); /* Shutdown the connection in case of timeout, error or resources shortage. * This starts short lingering close if not already there, or directly closes * the connection otherwise. * Pre-condition: nonblocking, can be called from anywhere provided cs is not - * in any timeout queue or in the pollset. + * in the pollset nor any non-backlog timeout queue. */ -static int shutdown_connection(event_conn_state_t *cs) +static void shutdown_connection(event_conn_state_t *cs, apr_time_t now, + int in_backlog) { - if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); - defer_lingering_close(cs); + ap_assert(!cs->q && !cs->te); + + if (cs->c) { + int log_level = APLOG_INFO; + switch (cs->pub.state) { + case CONN_STATE_LINGER: + case CONN_STATE_LINGER_NORMAL: + case CONN_STATE_LINGER_SHORT: + case CONN_STATE_KEEPALIVE: + log_level = APLOG_TRACE2; + default: + break; + } + ap_log_cerror(APLOG_MARK, log_level, 0, cs->c, APLOGNO(10380) + "shutting down %s connection in %s", + in_backlog ? "backlog" : "timed out", + cs_state_str(cs)); + + /* Don't re-schedule connections in lingering close, they had + * their chance already so just close them now. + */ + if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + cs->pub.state = CONN_STATE_LINGER; + push2worker(cs, NULL, now, NULL); + } + else { + close_connection(cs); + } } else { - close_connection(cs); + /* Never been scheduled/processed, kill it. 
*/ + ap_assert(in_backlog); + kill_connection(cs, APR_EBUSY); } - return 1; } /* @@ -1388,10 +1414,13 @@ static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd) event_conn_state_t *cs = apr_pcalloc(p, sizeof(*cs)); listener_poll_type *pt; - cs->p = p; - cs->pfd.desc.s = csd; cs->pfd.desc_type = APR_POLL_SOCKET; + cs->pfd.desc.s = cs_se(cs)->sd = csd; cs->pfd.client_data = pt = apr_pcalloc(p, sizeof(*pt)); + cs_qe(cs)->cb_baton = cs_se(cs)->baton = cs; + cs_qe(cs)->type = AP_QUEUE_EVENT_SOCK; + cs_qe(cs)->data.se = cs_se(cs); + cs->p = cs_se(cs)->p = p; pt->type = PT_CSD; pt->baton = cs; @@ -1814,85 +1843,128 @@ static void init_serf(apr_pool_t *p) } #endif -static apr_status_t push_timer2worker(timer_event_t* te) +/* A backlog connection is both in the worker_queue (for a worker to pull + * it ASAP) and in the backlog_q (for the listener to enforce a timeout). + * The worker_queue can do the queuing on both queues for us, that is + * consistently and safely push/pop to/from both queues under its lock, + * thanks to a callback called when an event is pushed and popped. + */ +static void conn_state_backlog_cb(void *baton, int pushed) { - return ap_queue_push_timer(worker_queue, te); + event_conn_state_t *cs = baton; + + if (pushed) { + TO_QUEUE_APPEND(cs->sc->bl_q, cs); + } + else { /* popped */ + TO_QUEUE_REMOVE(cs->sc->bl_q, cs); + + /* not in backlog anymore */ + cs_qe(cs)->cb = NULL; + } } -/* - * Pre-condition: cs is neither in event_pollset nor a timeout queue - * this function may only be called by the listener - */ -static apr_status_t push2worker(event_conn_state_t *cs, apr_socket_t *csd, - apr_pool_t *ptrans) +static void timer_event_backlog_cb(void *baton, int pushed) { - apr_status_t rc; + timer_event_t *te = baton; + ap_assert(te && te_qe(te)); - if (cs) { - ptrans = cs->p; - csd = cs_sd(cs); + if (pushed) { + apr_atomic_inc32(&timers_count); } + else { /* popped */ + apr_atomic_dec32(&timers_count); - rc = ap_queue_push_socket(worker_queue, csd, cs, ptrans); - if (rc != APR_SUCCESS) { - ap_log_error(APLOG_MARK, APLOG_CRIT, rc, ap_server_conf, APLOGNO(00471) - "push2worker: ap_queue_push_socket failed"); - /* trash the connection; we couldn't queue the connected - * socket to a worker - */ - if (cs) { - kill_connection(cs, rc); - } - else { - if (csd) { - close_socket(csd); - } - if (ptrans) { - ap_queue_info_push_pool(worker_queue_info, ptrans); - } - } - signal_threads(ST_GRACEFUL); + /* not in backlog anymore */ + te_qe(te)->cb = NULL; } - - return rc; } -/* get_worker: - * If *have_idle_worker_p == 0, reserve a worker thread, and set - * *have_idle_worker_p = 1. - * If *have_idle_worker_p is already 1, will do nothing. - * If blocking == 1, block if all workers are currently busy. - * If no worker was available immediately, will set *all_busy to 1. - * XXX: If there are no workers, we should not block immediately but - * XXX: close all keep-alive connections first. 
+/* + * Pre-condition: cs is neither in event_pollset nor a queue + * this function may only be called by the listener */ -static void get_worker(int *have_idle_worker_p, int blocking, int *all_busy) +static void push2worker(event_conn_state_t *cs, timer_event_t *te, + apr_time_t now, int *above_limit) { + ap_queue_event_t *qe; apr_status_t rc; + int busy; + + ap_assert((cs != NULL) ^ (te != NULL)); - if (*have_idle_worker_p) { - /* already reserved a worker thread - must have hit a - * transient error on a previous pass + busy = (ap_queue_info_idlers_dec(worker_queue_info) < 0); + if (busy) { + /* Might need to kindle the fire by not accepting new connections until + * the situation settles down. The listener and new idling workers will + * test for should_enable_listensocks() to recover (when suitable). */ - return; + if (connections_above_limit(NULL)) { + disable_listensocks(); + if (above_limit) { + *above_limit = 1; + } + } } - if (blocking) - rc = ap_queue_info_wait_for_idler(worker_queue_info, all_busy); - else - rc = ap_queue_info_try_get_idler(worker_queue_info); + if (te) { + ap_assert(!te_in_backlog(te)); - if (rc == APR_SUCCESS || APR_STATUS_IS_EOF(rc)) { - *have_idle_worker_p = 1; - } - else if (!blocking && rc == APR_EAGAIN) { - *all_busy = 1; + qe = te_qe(te); + qe->cb = timer_event_backlog_cb; } else { - ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf, APLOGNO(00472) - "ap_queue_info_wait_for_idler failed. " - "Attempting to shutdown process gracefully"); - signal_threads(ST_GRACEFUL); + ap_assert(!cs_in_backlog(cs)); + ap_assert(!cs->q); + + if (busy && cs->pub.state == CONN_STATE_LINGER && cs->linger_started) { + /* Not worth lingering more on this connection if we are short of + * workers and everything is flushed+shutdown already, back out + * and close. + */ + ap_queue_info_idlers_inc(worker_queue_info); + close_connection(cs); + return; + } + + if (cs->c) { + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "pushing connection %" CS_FMT, + CS_ARG(cs)); + } + else { + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, + "pushing connection %" CS_FMT_TO, + CS_ARG_TO(cs)); + } + + qe = cs_qe(cs); + qe->cb = conn_state_backlog_cb; + } + + rc = ap_queue_push_event(worker_queue, qe); + if (rc != APR_SUCCESS) { + int mode = ST_GRACEFUL; + + ap_queue_info_idlers_inc(worker_queue_info); + + ap_log_error(APLOG_MARK, APLOG_CRIT, rc, ap_server_conf, APLOGNO(00471) + "push2worker: queuing %s failed", cs ? "socket" : "timer"); + + if (cs) { + /* Can't go anywhere, kill (and log). */ + kill_connection(cs, rc); + } + else { + /* Can't call te->cbfunc() and potentially block there, someone is + * going to miss this event thus never release their connection(s), + * graceful stop could never complete. 
+ */ + mode = ST_UNGRACEFUL; + } + + AP_DEBUG_ASSERT(0); + signal_threads(mode); } } @@ -1935,8 +2007,13 @@ static timer_event_t *get_timer_event(apr_time_t timeout, APR_RING_REMOVE(te, link); } else { - te = apr_skiplist_alloc(timer_skiplist, sizeof(timer_event_t)); - memset(te, 0, sizeof(*te)); + struct backlog_timer_event *bte; + /* invariant: (te == &bte->te) => (te_qe(te) == &bte->qe) */ + bte = apr_skiplist_alloc(timer_skiplist, sizeof(*bte)); + memset(bte, 0, sizeof(*bte)); + bte->qe.type = AP_QUEUE_EVENT_TIMER; + bte->qe.data.te = bte->qe.cb_baton = &bte->te; + te = &bte->te; } APR_RING_ELEM_INIT(te, link); @@ -2123,14 +2200,11 @@ static void process_lingering_close(event_conn_state_t *cs) CS_ARG(cs)); AP_DEBUG_ASSERT(CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)); + /* Flush and shutdown first */ if (!cs->linger_started) { + cs->linger_started = 1; /* once! */ + apr_atomic_inc32(&lingering_count); cs->pub.state = CONN_STATE_LINGER; - cs->linger_started = 1; - - /* defer_lingering_close() may have bumped lingering_count already */ - if (!cs->deferred_linger) { - apr_atomic_inc32(&lingering_count); - } apr_socket_timeout_set(csd, apr_time_from_sec(SECONDS_TO_LINGER)); if (ap_start_lingering_close(cs->c)) { @@ -2157,24 +2231,17 @@ static void process_lingering_close(event_conn_state_t *cs) cs->pub.state = CONN_STATE_LINGER_NORMAL; } cs->pub.sense = CONN_SENSE_DEFAULT; - - /* One timestamp/duration for the whole lingering close time. - * XXX: This makes the (short_)linger_q not sorted/ordered by expiring - * timeouts whenever multiple schedules are necessary (EAGAIN below), - * but we probabaly don't care since these connections do not count - * for connections_above_limit() and all of them will be killed when - * busy or gracefully stopping anyway. - */ - cs->queue_timestamp = apr_time_now(); } + /* Drain until EAGAIN or EOF/error, in the former case requeue and + * come back when readable again, otherwise the connection is over. + */ do { apr_size_t nbytes = sizeof(dummybuf); rv = apr_socket_recv(csd, dummybuf, &nbytes); } while (rv == APR_SUCCESS); if (APR_STATUS_IS_EAGAIN(rv)) { struct timeout_queue *q; - /* (Re)queue the connection to come back when readable */ q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; if (pollset_add(cs, CONN_SENSE_WANT_READ, q, NULL)) { return; /* queued */ @@ -2183,108 +2250,92 @@ static void process_lingering_close(event_conn_state_t *cs) close_connection(cs); } -/* call 'func' for all elements of 'q' above 'expiry'. +/* Call shutdown_connection() for the elements of 'q' that timed out, or + * for all if 'shrink' is set. 
* Pre-condition: timeout_mutex must already be locked - * Post-condition: timeout_mutex will be locked again */ -static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, - int (*func)(event_conn_state_t *)) +static unsigned int process_timeout_queue_ex(struct timeout_queue *queue, + apr_time_t now, + int shrink) { - apr_uint32_t total = 0, count; - event_conn_state_t *first, *cs, *last; - struct event_conn_state_t trash; - struct timeout_queue *qp; + unsigned int count = 0; + struct timeout_queue *q; - if (!*q->total) { - return; + if (!*queue->total) { + return 0; } - APR_RING_INIT(&trash.timeout_list, event_conn_state_t, timeout_list); - for (qp = q; qp; qp = qp->next) { - count = 0; - cs = first = last = APR_RING_FIRST(&qp->head); - while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t, - timeout_list)) { - /* Trash the entry if: - * - no expiry was given (zero means all), or - * - it expired (according to the queue timeout), or - * - the system clock skewed in the past: no entry should be - * registered above the given expiry (~now) + the queue - * timeout, we won't keep any here (eg. for centuries). - * - * Otherwise stop, no following entry will match thanks to the - * single timeout per queue (entries are added to the end!). - * This allows maintenance in O(1). - */ - if (expiry && cs->queue_timestamp + qp->timeout > expiry - && cs->queue_timestamp < expiry + qp->timeout) { - /* Since this is the next expiring entry of this queue, update - * the global queues_next_expiry if it's later than this one. + for (q = queue; q; q = q->next) { + while (!APR_RING_EMPTY(&q->head, event_conn_state_t, timeout_list)) { + event_conn_state_t *cs = APR_RING_FIRST(&q->head); + + ap_assert(cs->q == q); + + if (!shrink) { + /* Stop if this entry did not expire, no following one will + * thanks to the single timeout per queue (latest entries are + * added to the tail). */ - apr_time_t elem_expiry = cs->queue_timestamp + qp->timeout; - apr_time_t next_expiry = queues_next_expiry; - if (!next_expiry + apr_time_t elem_expiry = cs->queue_timestamp + q->timeout; + if (elem_expiry > now) { + /* This is the next expiring entry of this queue, update + * the global queues_next_expiry if it expires after + * this one. + */ + apr_time_t next_expiry = queues_next_expiry; + if (!next_expiry || next_expiry > elem_expiry + QUEUES_FUDGE_TIMEOUT) { - queues_next_expiry = elem_expiry; + queues_next_expiry = elem_expiry; + } + break; } - break; } - if (!pollset_del(cs, 1)) { - kill_connection(cs, APR_EGENERAL); - continue; + if (cs_in_backlog(cs)) { + /* Remove the backlog connection from worker_queue (note that + * the lock is held by the listener already when maintaining + * the backlog_q), and unreserve/set a worker/idler since + * none could handle the event. + */ + ap_assert(cs_qe(cs)->cb_baton == cs); + ap_assert(cs->q == cs->sc->bl_q); + ap_queue_info_idlers_inc(worker_queue_info); + ap_queue_kill_event_locked(worker_queue, cs_qe(cs)); + shutdown_connection(cs, now, 1); } - - if (cs == first) { - APR_RING_INSERT_HEAD(&qp->head, cs, event_conn_state_t, - timeout_list); + else if (pollset_del(cs, 1)) { + /* Removed from the pollset and timeout queue. */ + shutdown_connection(cs, now, 0); } else { - APR_RING_INSERT_AFTER(last, cs, timeout_list); + /* Can't go anywhere, kill (and log). 
*/ + kill_connection(cs, APR_EGENERAL); } - ++*qp->total; - ++qp->count; - last = cs; - cs = APR_RING_NEXT(cs, timeout_list); count++; } - if (!count) - continue; - - APR_RING_UNSPLICE(first, last, timeout_list); - APR_RING_SPLICE_TAIL(&trash.timeout_list, first, last, event_conn_state_t, - timeout_list); - AP_DEBUG_ASSERT(*q->total >= count && qp->count >= count); - *q->total -= count; - qp->count -= count; - total += count; } - if (!total) - return; - apr_thread_mutex_unlock(timeout_mutex); - first = APR_RING_FIRST(&trash.timeout_list); - do { - cs = APR_RING_NEXT(first, timeout_list); - APR_RING_ELEM_INIT(cs, timeout_list); - func(first); - first = cs; - } while (--total); - apr_thread_mutex_lock(timeout_mutex); + return count; } -static void process_keepalive_queue(apr_time_t expiry) +static APR_INLINE void process_timeout_queue(struct timeout_queue *queue, + apr_time_t now) { - /* If all workers are busy, we kill older keep-alive connections so - * that they may connect to another process. - */ - if (!expiry && *keepalive_q->total) { + (void)process_timeout_queue_ex(queue, now, 0); +} + +/* When all workers are busy or dying, kill'em all \m/ */ +static APR_INLINE void shrink_timeout_queue(struct timeout_queue *queue, + apr_time_t now) +{ + unsigned int count = process_timeout_queue_ex(queue, now, 1); + if (count) { ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, - "All workers are busy or dying, will shutdown %u " - "keep-alive connections", *keepalive_q->total); + "All workers are %s, %s queue shrinked (%u done, %u left)", + dying ? "dying" : "busy", queue->name, + count, apr_atomic_read32(queue->total)); } - process_timeout_queue(keepalive_q, expiry, shutdown_connection); } static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) @@ -2293,7 +2344,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) proc_info *ti = dummy; int process_slot = ti->pslot; process_score *ps = ap_get_scoreboard_process(process_slot); - int have_idle_worker = 0; apr_time_t last_log; last_log = event_time_now(); @@ -2316,7 +2366,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) int workers_were_busy = 0; socket_callback_baton_t *user_chain; const apr_pollfd_t *out_pfd; - apr_time_t now; + apr_time_t now, poll_time; event_conn_state_t *cs; timer_event_t *te; @@ -2325,6 +2375,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) check_infinite_requests(); } + now = poll_time = event_time_now(); + if (listener_may_exit) { int once = !dying; if (once) { @@ -2332,7 +2384,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } if (terminate_mode == ST_UNGRACEFUL - || apr_atomic_read32(&connection_count) == 0) + || (apr_atomic_read32(&connection_count) == 0 + && apr_atomic_read32(&timers_count) == 0)) break; if (once) { @@ -2345,7 +2398,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } if (APLOGtrace6(ap_server_conf)) { - now = event_time_now(); /* trace log status every second */ if (now - last_log > apr_time_from_sec(1)) { ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, @@ -2376,7 +2428,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * up occurs, otherwise periodic checks (maintenance, shutdown, ...) * must be performed. 
*/ - now = event_time_now(); timeout = -1; /* Push expired timers to a worker, the first remaining one (if any) @@ -2401,7 +2452,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) put_timer_event(te, 1); ap_assert(cs && cs->te == te); ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "timed out connection %" CS_FMT, CS_ARG(cs)); + "timed out connection %" CS_FMT, + CS_ARG(cs)); (void)pollset_del(cs, 0); kill_connection(cs, APR_TIMEUP); continue; @@ -2412,7 +2464,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) apr_pool_cleanup_run(te->pfds->pool, te->pfds, event_cleanup_poll_callback); } - push_timer2worker(te); + push2worker(NULL, te, now, &workers_were_busy); } if (te) { next_expiry = te->when; @@ -2453,13 +2505,14 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, - "pollset: wait for timeout=%" APR_TIME_T_FMT + "pollset: wait timeout=%" APR_TIME_T_FMT " queues_timeout=%" APR_TIME_T_FMT " timers_timeout=%" APR_TIME_T_FMT - " conns=%d exit=%d/%d", + " listen=%s conns=%d exit=%d/%d", timeout, - queues_next_expiry ? queues_next_expiry - now : -1, - timers_next_expiry ? timers_next_expiry - now : -1, + queues_next_expiry ? queues_next_expiry - now : 0, + timers_next_expiry ? timers_next_expiry - now : 0, + listensocks_disabled() ? "no" : "yes", apr_atomic_read32(&connection_count), listener_may_exit, dying); @@ -2476,34 +2529,36 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) num = 0; } - if (APLOGtrace7(ap_server_conf)) { - apr_time_t old_now = now; - now = event_time_now(); - - ap_log_error(APLOG_MARK, APLOG_TRACE7, rc, ap_server_conf, - "pollset: have #%i time=%" APR_TIME_T_FMT "/%" APR_TIME_T_FMT - " queues_timeout=%" APR_TIME_T_FMT - " timers_timeout=%" APR_TIME_T_FMT - " conns=%d exit=%d/%d", - (int)num, now - old_now, timeout, - queues_next_expiry ? queues_next_expiry - now : -1, - timers_next_expiry ? timers_next_expiry - now : -1, - apr_atomic_read32(&connection_count), - listener_may_exit, dying); - } - - /* XXX possible optimization: stash the current time for use as - * r->request_time for new requests or queues maintenance + /* Update "now" after polling and use it for everything below (all + * non-(indefinitely-)blocking code). "now - poll_time" is then the + * time passed in poll(). + * + * XXX possible optimization: stash this time for use as + * r->request_time for new requests. */ + now = event_time_now(); + + ap_log_error(APLOG_MARK, APLOG_TRACE7, rc, ap_server_conf, + "pollset: have num=%i" + " elapsed=%" APR_TIME_T_FMT "/%" APR_TIME_T_FMT + " queues_timeout=%" APR_TIME_T_FMT + " timers_timeout=%" APR_TIME_T_FMT + " listen=%s conns=%d exit=%d/%d", + (int)num, now - poll_time, timeout, + queues_next_expiry ? queues_next_expiry - now : 0, + timers_next_expiry ? timers_next_expiry - now : 0, + listensocks_disabled() ? 
"no" : "yes", + apr_atomic_read32(&connection_count), + listener_may_exit, dying); for (user_chain = NULL; num > 0; --num, ++out_pfd) { listener_poll_type *pt = out_pfd->client_data; + socket_callback_baton_t *baton; - if (pt->type == PT_CSD) { - /* one of the sockets is readable */ - int blocking = 1; - - cs = (event_conn_state_t *) pt->baton; + switch (pt->type) { + case PT_CSD: + /* one of the sockets is ready */ + cs = (event_conn_state_t *)pt->baton; ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, "polled connection %" CS_FMT, CS_ARG(cs)); @@ -2513,12 +2568,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) case CONN_STATE_ASYNC_WAITIO: cs->pub.state = CONN_STATE_PROCESSING; case CONN_STATE_WRITE_COMPLETION: - break; - case CONN_STATE_LINGER_NORMAL: case CONN_STATE_LINGER_SHORT: - /* don't wait for a worker for lingering close processing. */ - blocking = 0; break; default: @@ -2529,53 +2580,29 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ap_assert(0); } - if (!pollset_del(cs, 0)) { + if (pollset_del(cs, 0)) { + push2worker(cs, NULL, now, &workers_were_busy); + } + else { /* Can't go anywhere, kill (and log) and next. */ kill_connection(cs, APR_EGENERAL); - continue; } + break; - { - /* If we don't get a worker immediately (nonblocking), we - * close the connection; the client can re-connect to a - * different process for keepalive, and for lingering close - * the connection will be shutdown so the choice is to favor - * incoming/alive connections. - */ - get_worker(&have_idle_worker, blocking, - &workers_were_busy); - if (!have_idle_worker) { - shutdown_connection(cs); - } - else if (push2worker(cs, NULL, NULL) == APR_SUCCESS) { - have_idle_worker = 0; - } - } - } - else if (pt->type == PT_ACCEPT && !listeners_disabled()) { + case PT_ACCEPT: /* A Listener Socket is ready for an accept() */ if (workers_were_busy) { - disable_listensocks(); - ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - APLOGNO(03268) - "All workers busy, not accepting new conns " - "in this process"); - } - else if (connections_above_limit(&workers_were_busy)) { - disable_listensocks(); - ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - APLOGNO(03269) - "Too many open connections (%u, idlers %u), " - "not accepting new conns in this process", - apr_atomic_read32(&connection_count), - ap_queue_info_num_idlers(worker_queue_info)); + /* Listeners disabled for now, keep the new connection in + * the socket backlog until listening again. 
+ */ + continue; } - else if (!listener_may_exit) { + if (!dying) { void *csd = NULL; ap_listen_rec *lr = (ap_listen_rec *) pt->baton; apr_pool_t *ptrans; /* Pool for per-transaction stuff */ - ap_queue_info_pop_pool(worker_queue_info, &ptrans); + ptrans = ap_queue_info_pop_pool(worker_queue_info); if (ptrans == NULL) { /* create a new transaction pool for each accepted socket */ apr_allocator_t *allocator = NULL; @@ -2604,25 +2631,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } } - get_worker(&have_idle_worker, 1, &workers_were_busy); rc = lr->accept_func(&csd, lr, ptrans); - - /* later we trash rv and rely on csd to indicate - * success/failure - */ - AP_DEBUG_ASSERT(rc == APR_SUCCESS || !csd); - - if (rc == APR_EGENERAL) { - /* E[NM]FILE, ENOMEM, etc */ - resource_shortage = 1; - signal_threads(ST_GRACEFUL); - } - else if (ap_accept_error_is_nonfatal(rc)) { - ap_log_error(APLOG_MARK, APLOG_DEBUG, rc, ap_server_conf, - "accept() on client socket failed"); - } - - if (csd != NULL) { + if (rc == APR_SUCCESS) { conns_this_child--; /* Create and account for the connection from here, or @@ -2630,40 +2640,45 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * would consider it does not exist and could exit the * child too early. */ + ap_assert(csd != NULL); cs = make_conn_state(ptrans, csd); - if (push2worker(cs, NULL, NULL) == APR_SUCCESS) { - have_idle_worker = 0; - } + push2worker(cs, NULL, now, &workers_were_busy); } else { + if (rc == APR_EGENERAL) { + /* E[NM]FILE, ENOMEM, etc */ + resource_shortage = 1; + signal_threads(ST_GRACEFUL); + } + else if (ap_accept_error_is_nonfatal(rc)) { + ap_log_error(APLOG_MARK, APLOG_DEBUG, rc, ap_server_conf, + "accept() on client socket failed"); + } ap_queue_info_push_pool(worker_queue_info, ptrans); } } - } /* if:else on pt->type */ + break; + #if HAVE_SERF - else if (pt->type == PT_SERF) { + case PT_SERF: /* send socket to serf. */ - /* XXXX: this doesn't require get_worker() */ + /* XXXX: this doesn't require a worker thread */ serf_event_trigger(g_serf, pt->baton, out_pfd); - } - + break; #endif - else if (pt->type == PT_USER) { - socket_callback_baton_t *baton = pt->baton; - if (baton->cancel_event) { - baton->cancel_event->canceled = 1; - } - /* We only signal once per N sockets with this baton, - * and after this loop to avoid any race/lifetime issue - * with the user callback being called while we handle - * the same baton multiple times here. + case PT_USER: + /* Multiple pfds of the same baton might trigger in this pass + * so chain once here and run the cleanup only after this loop + * to avoid lifetime issues (i.e. pfds->pool cleared while some + * of its pfd->client_data are still to be dereferenced here). 
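The "chain now, clean up after the loop" pattern described above can be shown in a minimal standalone form. This sketch uses an explicit flag for clarity, whereas the patch itself avoids the flag by testing the baton's next pointer against the chain head:

```c
/* Hypothetical minimal types, for illustration only. */
typedef struct work {
    struct work *next;
    int chained;              /* already linked into the deferred chain? */
} work;

/* Phase 1: while scanning results, only link items; destroying their
 * backing state here could invalidate data that later results in the
 * same scan still reference. */
static void chain_once(work **head, work *w)
{
    if (!w->chained) {
        w->chained = 1;
        w->next = *head;
        *head = w;
    }
}

/* Phase 2: after the scan, it is safe to run the destructive part. */
static void run_deferred(work *head, void (*cleanup)(work *))
{
    while (head) {
        work *w = head;
        head = w->next;
        w->next = NULL;
        w->chained = 0;
        cleanup(w);
    }
}
```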
*/ - if (!baton->signaled) { - baton->signaled = 1; + baton = pt->baton; + if (baton != user_chain && !baton->next) { baton->next = user_chain; user_chain = baton; } + break; } } /* for processing poll */ @@ -2673,6 +2688,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) user_chain = user_chain->next; baton->next = NULL; + /* Not expirable anymore */ + if (baton->cancel_event) { + baton->cancel_event->canceled = 1; + baton->cancel_event = NULL; + } + /* remove all sockets from the pollset */ apr_pool_cleanup_run(baton->pfds->pool, baton->pfds, event_cleanup_poll_callback); @@ -2683,7 +2704,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) baton->user_baton, 0, /* don't insert it */ NULL /* no associated socket callback */); - push_timer2worker(te); + push2worker(NULL, te, now, &workers_were_busy); } /* We process the timeout queues here only when the global @@ -2692,10 +2713,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * while latest ones are only taken into account here (in listener) * during queues' processing, with the lock held. This works both * with and without wake-ability. + * Even if "now" drifted a bit since it was fetched and the real + * "now" went below "expiry" in the meantime, the next poll() will + * return immediately so the maintenance will happen then. */ next_expiry = queues_next_expiry; + if (next_expiry && next_expiry <= now) { do_maintenance: - if (next_expiry && next_expiry <= (now = event_time_now())) { ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "queues maintenance: expired=%" APR_TIME_T_FMT, next_expiry > 0 ? now - next_expiry : -1); @@ -2705,29 +2729,39 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* Recompute this by walking the timeout queues (under the lock) */ queues_next_expiry = 0; - /* Step 1: keepalive queue timeouts are closed */ + /* Step 1: keepalive queue timeouts */ if (workers_were_busy || dying) { - process_keepalive_queue(0); /* kill'em all \m/ */ + shrink_timeout_queue(keepalive_q, now); } else { - process_keepalive_queue(now); + process_timeout_queue(keepalive_q, now); } - /* Step 2: waitio queue timeouts are flushed */ - process_timeout_queue(waitio_q, now, defer_lingering_close); + /* Step 2: waitio queue timeouts */ + process_timeout_queue(waitio_q, now); - /* Step 3: write completion queue timeouts are flushed */ - process_timeout_queue(write_completion_q, now, defer_lingering_close); + /* Step 3: write completion queue timeouts */ + process_timeout_queue(write_completion_q, now); - /* Step 4: normal lingering close queue timeouts are closed */ + /* Step 4: normal lingering close queue timeouts */ if (dying && linger_q->timeout > short_linger_q->timeout) { /* Dying, force short timeout for normal lingering close */ linger_q->timeout = short_linger_q->timeout; } - process_timeout_queue(linger_q, now, shutdown_connection); + process_timeout_queue(linger_q, now); - /* Step 5: short lingering close queue timeouts are closed */ - process_timeout_queue(short_linger_q, now, shutdown_connection); + /* Step 5: short lingering close queue timeouts */ + process_timeout_queue(short_linger_q, now); + + /* Step 6: backlog queue timeouts + * Connections in backlog race with the workers (dequeuing) under + * the worker_queue mutex. 
+ */ + if (apr_atomic_read32(backlog_q->total)) { + ap_queue_lock(worker_queue); + process_timeout_queue(backlog_q, now); + ap_queue_unlock(worker_queue); + } next_expiry = queues_next_expiry; apr_thread_mutex_unlock(timeout_mutex); @@ -2740,34 +2774,17 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->write_completion = apr_atomic_read32(write_completion_q->total); ps->keep_alive = apr_atomic_read32(keepalive_q->total); ps->lingering_close = apr_atomic_read32(&lingering_count); + ps->backlog = apr_atomic_read32(backlog_q->total); ps->suspended = apr_atomic_read32(&suspended_count); ps->connections = apr_atomic_read32(&connection_count); } else if ((workers_were_busy || dying) && apr_atomic_read32(keepalive_q->total)) { apr_thread_mutex_lock(timeout_mutex); - process_keepalive_queue(0); /* kill'em all \m/ */ + shrink_timeout_queue(keepalive_q, now); apr_thread_mutex_unlock(timeout_mutex); ps->keep_alive = 0; } - - /* If there are some lingering closes to defer (to a worker), schedule - * them now. We might wakeup a worker spuriously if another one empties - * defer_linger_chain in the meantime, but there also may be no active - * or all busy workers for an undefined time. In any case a deferred - * lingering close can't starve if we do that here since the chain is - * filled only above in the listener and it's emptied only in the - * worker(s); thus a NULL here means it will stay so while the listener - * waits (possibly indefinitely) in poll(). - */ - if (defer_linger_chain) { - get_worker(&have_idle_worker, 0, &workers_were_busy); - if (have_idle_worker - && defer_linger_chain /* re-test */ - && push2worker(NULL, NULL, NULL) == APR_SUCCESS) { - have_idle_worker = 0; - } - } } /* listener main loop */ ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, @@ -2822,8 +2839,8 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) int process_slot = ti->pslot; int thread_slot = ti->tslot; worker_score *ws = &ap_scoreboard_image->servers[process_slot][thread_slot]; + int is_idler = 0; apr_status_t rv; - int is_idle = 0; free(ti); @@ -2834,26 +2851,14 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) SERVER_STARTING, NULL); for (;;) { - apr_socket_t *csd = NULL; - event_conn_state_t *cs = NULL; - timer_event_t *te = NULL; - apr_pool_t *ptrans; /* Pool for per-transaction stuff */ + ap_queue_event_t *qe; - if (!is_idle) { - rv = ap_queue_info_set_idle(worker_queue_info, NULL); - if (rv != APR_SUCCESS) { - ap_log_error(APLOG_MARK, APLOG_EMERG, rv, ap_server_conf, - APLOGNO(03270) - "ap_queue_info_set_idle failed. 
Attempting to " - "shutdown process gracefully."); - signal_threads(ST_GRACEFUL); - break; - } + if (!is_idler) { + int idlers = ap_queue_info_idlers_inc(worker_queue_info); ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "worker thread %i/%i idle (idlers %i)", - thread_slot, threads_per_child, - ap_queue_info_num_idlers(worker_queue_info)); - is_idle = 1; + thread_slot, threads_per_child, idlers); + is_idler = 1; /* If the listening sockets are paused and this new idler switches * connections_above_limit() back, let the listener know and poll @@ -2879,9 +2884,7 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) break; } - rv = ap_queue_pop_something(worker_queue, &csd, (void **)&cs, - &ptrans, &te); - + rv = ap_queue_pop_event(worker_queue, &qe); if (rv != APR_SUCCESS) { /* We get APR_EOF during a graceful shutdown once all the * connections accepted by this server process have been handled. @@ -2893,12 +2896,12 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) break; } - /* We get APR_EINTR whenever ap_queue_pop_*() has been interrupted - * from an explicit call to ap_queue_interrupt_all(). This allows - * us to unblock threads stuck in ap_queue_pop_*() when a shutdown - * is pending. + /* We get APR_EINTR whenever ap_queue_pop_event() has been + * interrupted from an explicit call to ap_queue_interrupt_*(). + * This allows us to unblock threads stuck in ap_queue_pop_event() + * when a shutdown is pending. * - * If workers_may_exit is set and this is ungraceful termination/ + * If workers_may_exit is set and this is ungraceful stop or * restart, we are bound to get an error on some systems (e.g., * AIX, which sanity-checks mutex operations) since the queue * may have already been cleaned up. 
Don't log the "error" if @@ -2906,59 +2909,60 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) */ if (!APR_STATUS_IS_EINTR(rv) && !workers_may_exit) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, ap_server_conf, - APLOGNO(03099) "ap_queue_pop_something failed"); + APLOGNO(03099) "ap_queue_pop_event failed"); AP_DEBUG_ASSERT(0); signal_threads(ST_GRACEFUL); } continue; } + is_idler = 0; /* event consumed */ ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "worker thread %i/%i busy (idlers %i)", thread_slot, threads_per_child, - ap_queue_info_num_idlers(worker_queue_info)); + ap_queue_info_idlers_count(worker_queue_info)); + + if (qe->type == AP_QUEUE_EVENT_SOCK) { + apr_pool_t *p; + apr_socket_t *csd; + event_conn_state_t *cs; + + ap_assert(qe->data.se); + p = qe->data.se->p; + csd = qe->data.se->sd; + cs = qe->data.se->baton; + ap_assert(p && csd && cs && qe == cs_qe(cs)); + + worker_sockets[thread_slot] = csd; + process_socket(thd, p, csd, cs, process_slot, thread_slot); + worker_sockets[thread_slot] = NULL; + } + else if (qe->type == AP_QUEUE_EVENT_TIMER) { + timer_event_t *te; + ap_mpm_callback_fn_t *cbfunc; + void *baton; + + te = qe->data.te; + ap_assert(te && qe == te_qe(te)); + + cbfunc = te->cbfunc; + baton = te->baton; - if (te != NULL) { - void *baton = te->baton; - ap_mpm_callback_fn_t *cbfunc = te->cbfunc; /* first recycle the timer event */ put_timer_event(te, 0); + + ap_update_child_status_from_indexes(process_slot, thread_slot, + SERVER_BUSY_WRITE, NULL); + ap_assert(cbfunc != NULL); cbfunc(baton); } else { - is_idle = 0; /* consumed */ - if (csd != NULL) { - worker_sockets[thread_slot] = csd; - process_socket(thd, ptrans, csd, cs, process_slot, thread_slot); - worker_sockets[thread_slot] = NULL; - } - } - - /* If there are deferred lingering closes, handle them now. */ - while (!workers_may_exit) { - cs = defer_linger_chain; - if (!cs) { - break; - } - if (apr_atomic_casptr((void *)&defer_linger_chain, cs->chain, - cs) != cs) { - /* Race lost, try again */ - continue; - } - cs->chain = NULL; - AP_DEBUG_ASSERT(cs->pub.state == CONN_STATE_LINGER); - - ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "deferred close for connection %" CS_FMT, CS_ARG(cs)); - - worker_sockets[thread_slot] = csd = cs_sd(cs); - process_socket(thd, cs->p, csd, cs, process_slot, thread_slot); - worker_sockets[thread_slot] = NULL; + ap_assert(0); } } - if (is_idle) { + if (is_idler) { /* Not idling anymore */ - ap_queue_info_wait_for_idler(worker_queue_info, NULL); + ap_queue_info_idlers_dec(worker_queue_info); } ap_update_child_status_from_indexes(process_slot, thread_slot, @@ -3011,10 +3015,10 @@ static void setup_threads_runtime(void) APR_POLLSET_KQUEUE, APR_POLLSET_EPOLL }; /* XXX: K-A or lingering close connection included in the async factor */ - const unsigned int threads_factor = worker_factor / WORKER_FACTOR_SCALE; - const apr_size_t pollset_size = ((unsigned int)num_listensocks + - (unsigned int)threads_per_child * - (threads_factor > 2 ? threads_factor : 2)); + unsigned int async_factor = (worker_factor < WORKER_FACTOR_SCALE * 2 + ? 
WORKER_FACTOR_SCALE * 2 : worker_factor); + unsigned int async_threads = (threads_per_child * async_factor / WORKER_FACTOR_SCALE); + const apr_size_t pollset_size = (num_listensocks + async_threads + POLLSET_RESERVE_SIZE); int pollset_flags; /* Event's skiplist operations will happen concurrently with other modules' @@ -3046,8 +3050,8 @@ static void setup_threads_runtime(void) apr_pool_tag(pruntime, "mpm_runtime"); /* We must create the fd queues before we start up the listener - * and worker threads. */ - rv = ap_queue_create(&worker_queue, threads_per_child, pruntime); + * and worker threads, it's bounded by connections_above_limit(). */ + rv = ap_queue_create(&worker_queue, -1, pruntime); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ALERT, rv, ap_server_conf, APLOGNO(03100) "ap_queue_create() failed"); @@ -3061,8 +3065,7 @@ static void setup_threads_runtime(void) */ max_recycled_pools = threads_per_child * 3 / 4 ; } - rv = ap_queue_info_create(&worker_queue_info, pruntime, - threads_per_child, max_recycled_pools); + rv = ap_queue_info_create(&worker_queue_info, pruntime, max_recycled_pools); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ALERT, rv, ap_server_conf, APLOGNO(03101) "ap_queue_info_create() failed"); @@ -3639,6 +3642,7 @@ static void perform_idle_server_maintenance(void) int max_daemon_used = 0; int idle_thread_count = 0; int active_thread_count = 0; + int backlog_count = 0; int i, j; for (i = 0; i < server_limit; ++i) { @@ -3682,6 +3686,7 @@ static void perform_idle_server_maintenance(void) } } active_thread_count += child_threads_active; + backlog_count += apr_atomic_read32(&ps->backlog); if (child_threads_active == threads_per_child) { had_healthy_child = 1; } @@ -3855,10 +3860,10 @@ static void perform_idle_server_maintenance(void) retained->max_daemon_used = max_daemon_used; if (APLOGdebug(ap_server_conf)) { ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - "score: idlers:%d, " + "score: idlers:%d backlog:%d, " "threads active:%d/%d max:%d, " "daemons active:%d/%d max:%d used:%d/%d/%d", - idle_thread_count, + idle_thread_count, backlog_count, active_thread_count, retained->active_daemons * threads_per_child, max_workers, retained->active_daemons, retained->total_daemons, active_daemons_limit, max_daemon_used, retained->max_daemon_used, @@ -4425,14 +4430,12 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, active_daemons_limit = server_limit; threads_per_child = DEFAULT_THREADS_PER_CHILD; max_workers = active_daemons_limit * threads_per_child; - defer_linger_chain = NULL; had_healthy_child = 0; ap_extended_status = 0; event_pollset = NULL; worker_queue_info = NULL; listener_os_thread = NULL; - listensocks_disabled = 0; listener_is_wakeable = 0; return OK; @@ -4441,7 +4444,7 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, apr_pool_t *ptemp, server_rec *s) { - apr_hash_t *io_h, *wc_h, *ka_h; + apr_hash_t *io_h, *wc_h, *ka_h, *bl_h; /* Not needed in pre_config stage */ if (ap_state_query(AP_SQ_MAIN_STATE) == AP_SQ_MS_CREATE_PRE_CONFIG) { @@ -4451,6 +4454,7 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, io_h = apr_hash_make(ptemp); wc_h = apr_hash_make(ptemp); ka_h = apr_hash_make(ptemp); + bl_h = apr_hash_make(ptemp); linger_q = TO_QUEUE_MAKE(pconf, "linger", apr_time_from_sec(MAX_SECS_TO_LINGER), NULL); @@ -4470,6 +4474,9 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, sc->ka_q = 
TO_QUEUE_CHAIN(pconf, "keepalive", s->keep_alive_timeout, &keepalive_q, ka_h, ptemp); + + sc->bl_q = TO_QUEUE_CHAIN(pconf, "backlog", s->timeout, + &backlog_q, bl_h, ptemp); } return OK; diff --git a/server/mpm/worker/worker.c b/server/mpm/worker/worker.c index 42b81a8ed1b..1fff5b085e6 100644 --- a/server/mpm/worker/worker.c +++ b/server/mpm/worker/worker.c @@ -583,7 +583,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t *thd, void * dummy) if (listener_may_exit) break; if (!have_idle_worker) { - rv = ap_queue_info_wait_for_idler(worker_queue_info, NULL); + rv = ap_queue_info_wait_for_idler(worker_queue_info); if (APR_STATUS_IS_EOF(rv)) { break; /* we've been signaled to die now */ } @@ -662,7 +662,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t *thd, void * dummy) if (!listener_may_exit) { /* the following pops a recycled ptrans pool off a stack */ - ap_queue_info_pop_pool(worker_queue_info, &ptrans); + ptrans = ap_queue_info_pop_pool(worker_queue_info); if (ptrans == NULL) { /* we can't use a recycled transaction pool this time. * create a new transaction pool */ @@ -696,7 +696,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t *thd, void * dummy) accept_mutex_error("unlock", rv, process_slot); } if (csd != NULL) { - rv = ap_queue_push_socket(worker_queue, csd, NULL, ptrans); + rv = ap_queue_push_socket(worker_queue, csd, ptrans); if (rv) { /* trash the connection; we couldn't queue the connected * socket to a worker @@ -901,8 +901,7 @@ static void setup_threads_runtime(void) clean_child_exit(APEXIT_CHILDFATAL); } - rv = ap_queue_info_create(&worker_queue_info, pruntime, - threads_per_child, -1); + rv = ap_queue_info_create(&worker_queue_info, pruntime, -1); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ALERT, rv, ap_server_conf, APLOGNO(03141) "ap_queue_info_create() failed"); diff --git a/server/mpm_fdqueue.c b/server/mpm_fdqueue.c index 3697ca722f6..7871597d910 100644 --- a/server/mpm_fdqueue.c +++ b/server/mpm_fdqueue.c @@ -20,7 +20,23 @@ #include -static const apr_uint32_t zero_pt = APR_UINT32_MAX/2; +#define ZERO_PT (APR_UINT32_MAX / 2) + +APR_RING_HEAD(fd_queue_ring, fd_queue_elem_t); + +struct fd_queue_t +{ + struct fd_queue_ring elts; + apr_uint32_t nelts; + apr_uint32_t bounds; + apr_pool_t *spare_pool; + fd_queue_elem_t *spare_elems; + apr_thread_mutex_t *one_big_mutex; + apr_thread_cond_t *not_empty; + apr_uint32_t num_waiters; + apr_uint32_t interrupted; + apr_uint32_t terminated; +}; struct recycled_pool { @@ -30,59 +46,43 @@ struct recycled_pool struct fd_queue_info_t { - apr_uint32_t volatile idlers; /** - * >= zero_pt: number of idle worker threads - * < zero_pt: number of threads blocked, - * waiting for an idle worker - */ + apr_uint32_t volatile idlers; /* >= ZERO_PT: number of idle worker threads + * < ZERO_PT: number of events in backlog + * (waiting for an idle thread) */ apr_thread_mutex_t *idlers_mutex; apr_thread_cond_t *wait_for_idler; - int terminated; - int max_idlers; - int max_recycled_pools; - apr_uint32_t recycled_pools_count; + apr_uint32_t max_idlers; + apr_uint32_t terminated; struct recycled_pool *volatile recycled_pools; + apr_uint32_t recycled_pools_count; + apr_uint32_t max_recycled_pools; }; struct fd_queue_elem_t { - apr_socket_t *sd; - void *sd_baton; - apr_pool_t *p; + APR_RING_ENTRY(fd_queue_elem_t) link; /* in ring */ + struct fd_queue_elem_t *next; /* in spare list */ + sock_event_t self_sock_event; + ap_queue_event_t self_event; + ap_queue_event_t *event; }; -static apr_status_t 
queue_info_cleanup(void *data_) +static apr_status_t queue_info_cleanup(void *qi) { - fd_queue_info_t *qi = data_; - apr_thread_cond_destroy(qi->wait_for_idler); - apr_thread_mutex_destroy(qi->idlers_mutex); - - /* Clean up any pools in the recycled list */ - for (;;) { - struct recycled_pool *first_pool = qi->recycled_pools; - if (first_pool == NULL) { - break; - } - if (apr_atomic_casptr((void *)&qi->recycled_pools, first_pool->next, - first_pool) == first_pool) { - apr_pool_destroy(first_pool->pool); - } - } - + /* Clean up all pools in the recycled list */ + ap_queue_info_free_idle_pools(qi); return APR_SUCCESS; } -apr_status_t ap_queue_info_create(fd_queue_info_t **queue_info, - apr_pool_t *pool, int max_idlers, - int max_recycled_pools) +AP_DECLARE(apr_status_t) ap_queue_info_create(fd_queue_info_t **queue_info, + apr_pool_t *pool, int max_recycled_pools) { apr_status_t rv; fd_queue_info_t *qi; qi = apr_pcalloc(pool, sizeof(*qi)); - rv = apr_thread_mutex_create(&qi->idlers_mutex, APR_THREAD_MUTEX_DEFAULT, - pool); + rv = apr_thread_mutex_create(&qi->idlers_mutex, APR_THREAD_MUTEX_DEFAULT, pool); if (rv != APR_SUCCESS) { return rv; } @@ -90,27 +90,30 @@ apr_status_t ap_queue_info_create(fd_queue_info_t **queue_info, if (rv != APR_SUCCESS) { return rv; } - qi->recycled_pools = NULL; - qi->max_recycled_pools = max_recycled_pools; - qi->max_idlers = max_idlers; - qi->idlers = zero_pt; + qi->idlers = ZERO_PT; + if (max_recycled_pools >= 0) { + qi->max_recycled_pools = max_recycled_pools; + } + else { + qi->max_recycled_pools = APR_INT32_MAX; + } + apr_pool_cleanup_register(pool, qi, queue_info_cleanup, apr_pool_cleanup_null); *queue_info = qi; - return APR_SUCCESS; } -apr_status_t ap_queue_info_set_idle(fd_queue_info_t *queue_info, - apr_pool_t *pool_to_recycle) +AP_DECLARE(apr_status_t) ap_queue_info_set_idle(fd_queue_info_t *queue_info, + apr_pool_t *pool_to_recycle) { apr_status_t rv; ap_queue_info_push_pool(queue_info, pool_to_recycle); /* If other threads are waiting on a worker, wake one up */ - if (apr_atomic_inc32(&queue_info->idlers) < zero_pt) { + if (apr_atomic_inc32(&queue_info->idlers) < ZERO_PT) { rv = apr_thread_mutex_lock(queue_info->idlers_mutex); if (rv != APR_SUCCESS) { AP_DEBUG_ASSERT(0); @@ -130,23 +133,25 @@ apr_status_t ap_queue_info_set_idle(fd_queue_info_t *queue_info, return APR_SUCCESS; } -apr_status_t ap_queue_info_try_get_idler(fd_queue_info_t *queue_info) +AP_DECLARE(apr_status_t) ap_queue_info_try_get_idler(fd_queue_info_t *queue_info) { /* Don't block if there isn't any idle worker. */ + apr_uint32_t idlers = queue_info->idlers, val; for (;;) { - apr_uint32_t idlers = queue_info->idlers; - if (idlers <= zero_pt) { + if (idlers <= ZERO_PT) { return APR_EAGAIN; } - if (apr_atomic_cas32(&queue_info->idlers, idlers - 1, - idlers) == idlers) { + + val = apr_atomic_cas32(&queue_info->idlers, idlers - 1, idlers); + if (val == idlers) { return APR_SUCCESS; } + + idlers = val; } } -apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, - int *had_to_block) +AP_DECLARE(apr_status_t) ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info) { apr_status_t rv; @@ -154,7 +159,7 @@ apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, * apr_atomic_add32(x, -1) does the same as dec32(x), except * that it returns the previous value (unlike dec32's bool). 
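The distinction matters because the previous value tells the caller on which side of the ZERO_PT midpoint the counter was before the operation. A compact illustration of the offset-counter idiom, using a standalone counter rather than the real queue_info:

```c
#include <apr_atomic.h>

#define ZERO_PT (APR_UINT32_MAX / 2)

/* Starts at the midpoint: values above ZERO_PT count idle workers,
 * values below it count waiters that found no idle worker. */
static apr_uint32_t idlers = ZERO_PT;

/* Returns 1 if an idler was reserved without waiting, 0 if the caller
 * must block. apr_atomic_add32() returns the value *before* the
 * decrement, which is exactly the information needed here; dec32()
 * would only report whether the result is nonzero. */
static int try_reserve_idler(void)
{
    return apr_atomic_add32(&idlers, -1) > ZERO_PT;
}

static void release_idler(void)
{
    apr_atomic_inc32(&idlers);
}
```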
*/ - if (apr_atomic_add32(&queue_info->idlers, -1) <= zero_pt) { + if (apr_atomic_add32(&queue_info->idlers, -1) <= ZERO_PT) { rv = apr_thread_mutex_lock(queue_info->idlers_mutex); if (rv != APR_SUCCESS) { AP_DEBUG_ASSERT(0); @@ -177,13 +182,14 @@ apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, * now non-negative, it's safe for this function to * return immediately. * - * A "negative value" (relative to zero_pt) in + * A "negative value" (relative to ZERO_PT) in * queue_info->idlers tells how many * threads are waiting on an idle worker. */ - if (queue_info->idlers < zero_pt) { - if (had_to_block) { - *had_to_block = 1; + if (apr_atomic_read32(&queue_info->idlers) < ZERO_PT) { + if (queue_info->terminated) { + apr_thread_mutex_unlock(queue_info->idlers_mutex); + return APR_EOF; } rv = apr_thread_cond_wait(queue_info->wait_for_idler, queue_info->idlers_mutex); @@ -199,7 +205,7 @@ apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, } } - if (queue_info->terminated) { + if (apr_atomic_read32(&queue_info->terminated)) { return APR_EOF; } else { @@ -207,52 +213,75 @@ apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, } } -apr_uint32_t ap_queue_info_num_idlers(fd_queue_info_t *queue_info) +AP_DECLARE(apr_uint32_t) ap_queue_info_num_idlers(fd_queue_info_t *queue_info) { - apr_uint32_t val; - val = apr_atomic_read32(&queue_info->idlers); - return (val > zero_pt) ? val - zero_pt : 0; + apr_uint32_t val = apr_atomic_read32(&queue_info->idlers); + return (val > ZERO_PT) ? val - ZERO_PT : 0; } -void ap_queue_info_push_pool(fd_queue_info_t *queue_info, - apr_pool_t *pool_to_recycle) +AP_DECLARE(apr_int32_t) ap_queue_info_idlers_count(fd_queue_info_t *queue_info) { - struct recycled_pool *new_recycle; + return apr_atomic_read32(&queue_info->idlers) - ZERO_PT; +} + +AP_DECLARE(apr_int32_t) ap_queue_info_idlers_inc(fd_queue_info_t *queue_info) +{ + /* apr_atomic_add32() returns the previous value, we return the new one */ + return apr_atomic_add32(&queue_info->idlers, +1) + 1 - ZERO_PT; +} + +AP_DECLARE(apr_int32_t) ap_queue_info_idlers_dec(fd_queue_info_t *queue_info) +{ + /* apr_atomic_add32() returns the previous value, we return the new one */ + return apr_atomic_add32(&queue_info->idlers, -1) - 1 - ZERO_PT; +} + +AP_DECLARE(void) ap_queue_info_push_pool(fd_queue_info_t *queue_info, + apr_pool_t *pool_to_recycle) +{ + struct recycled_pool *new_recycle, *first_pool, *val; + apr_uint32_t count; + /* If we have been given a pool to recycle, atomically link * it into the queue_info's list of recycled pools */ if (!pool_to_recycle) return; - if (queue_info->max_recycled_pools >= 0) { - apr_uint32_t n = apr_atomic_read32(&queue_info->recycled_pools_count); - if (n >= queue_info->max_recycled_pools) { - apr_pool_destroy(pool_to_recycle); - return; - } - apr_atomic_inc32(&queue_info->recycled_pools_count); + /* The counting is racy but we don't mind recycling a few more/less pools, + * it's lighter than a compare & swap loop or an inc + dec to back out. 
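The racy-but-bounded counting mentioned above is a deliberate trade-off that also stands on its own; here is a sketch with a hypothetical soft-bounded cache (MAX_CACHED and cache_pool() are made up for the example):

```c
#include <apr_atomic.h>
#include <apr_pools.h>

#define MAX_CACHED 32
static apr_uint32_t cached_count;

/* Best-effort bound: the read and the increment are not atomic as a
 * pair, so two racing threads can both pass the test and overshoot
 * MAX_CACHED by an entry or two. That is acceptable for a soft limit
 * and cheaper than a CAS retry loop (or an inc followed by a dec to
 * back out) on every recycled pool. */
static int cache_pool(apr_pool_t *p)
{
    if (apr_atomic_read32(&cached_count) >= MAX_CACHED) {
        apr_pool_destroy(p);      /* over the soft limit: just free it */
        return 0;
    }
    apr_atomic_inc32(&cached_count);
    /* ... link p into the (atomic) free list, as the code above does ... */
    return 1;
}
```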
+     */
+    count = apr_atomic_read32(&queue_info->recycled_pools_count);
+    if (count >= queue_info->max_recycled_pools) {
+        apr_pool_destroy(pool_to_recycle);
+        return;
     }
+    apr_atomic_inc32(&queue_info->recycled_pools_count);

     apr_pool_clear(pool_to_recycle);
     new_recycle = apr_palloc(pool_to_recycle, sizeof *new_recycle);
     new_recycle->pool = pool_to_recycle;
+
+    first_pool = queue_info->recycled_pools;
     for (;;) {
-        /*
-         * Save queue_info->recycled_pool in local variable next because
-         * new_recycle->next can be changed after apr_atomic_casptr
-         * function call. For gory details see PR 44402.
+        new_recycle->next = first_pool;
+        val = apr_atomic_casptr((void *)&queue_info->recycled_pools,
+                                new_recycle, first_pool);
+        /* Don't compare with new_recycle->next because it can change
+         * after apr_atomic_casptr(). For gory details see PR 44402.
         */
-        struct recycled_pool *next = queue_info->recycled_pools;
-        new_recycle->next = next;
-        if (apr_atomic_casptr((void *)&queue_info->recycled_pools,
-                              new_recycle, next) == next)
-            break;
+        if (val == first_pool) {
+            return;
+        }
+
+        first_pool = val;
     }
 }

-void ap_queue_info_pop_pool(fd_queue_info_t *queue_info,
-                            apr_pool_t **recycled_pool)
+AP_DECLARE(apr_pool_t *) ap_queue_info_pop_pool(fd_queue_info_t *queue_info)
 {
+    struct recycled_pool *first_pool, *val;
+
     /* Atomically pop a pool from the recycled list */

     /* This function is safe only as long as it is single threaded because
@@ -262,41 +291,43 @@ void ap_queue_info_pop_pool(fd_queue_info_t *queue_info,
      * happen concurrently with a single cas-based pop.
      */

-    *recycled_pool = NULL;
-
-
-    /* Atomically pop a pool from the recycled list */
+    first_pool = queue_info->recycled_pools;
     for (;;) {
-        struct recycled_pool *first_pool = queue_info->recycled_pools;
         if (first_pool == NULL) {
-            break;
+            return NULL;
         }
-        if (apr_atomic_casptr((void *)&queue_info->recycled_pools,
-                              first_pool->next, first_pool) == first_pool) {
-            *recycled_pool = first_pool->pool;
-            if (queue_info->max_recycled_pools >= 0)
-                apr_atomic_dec32(&queue_info->recycled_pools_count);
-            break;
+
+        val = apr_atomic_casptr((void *)&queue_info->recycled_pools,
+                                first_pool->next, first_pool);
+        if (val == first_pool) {
+            apr_atomic_dec32(&queue_info->recycled_pools_count);
+            return first_pool->pool;
         }
+
+        first_pool = val;
     }
 }

-void ap_queue_info_free_idle_pools(fd_queue_info_t *queue_info)
+AP_DECLARE(void) ap_queue_info_free_idle_pools(fd_queue_info_t *queue_info)
 {
     apr_pool_t *p;

-    queue_info->max_recycled_pools = 0;
+    /* Atomically free the recycled list */
+
+    /* As with ap_queue_info_pop_pool(), this must not be called concurrently
+     * with itself; for now it's only called from the listener thread.
+     */
+
     for (;;) {
-        ap_queue_info_pop_pool(queue_info, &p);
+        p = ap_queue_info_pop_pool(queue_info);
         if (p == NULL)
-            break;
+            return;
         apr_pool_destroy(p);
     }
-    apr_atomic_set32(&queue_info->recycled_pools_count, 0);
 }

-apr_status_t ap_queue_info_term(fd_queue_info_t *queue_info)
+AP_DECLARE(apr_status_t) ap_queue_info_term(fd_queue_info_t *queue_info)
 {
     apr_status_t rv;
@@ -305,47 +336,35 @@ apr_status_t ap_queue_info_term(fd_queue_info_t *queue_info)
         return rv;
     }

-    queue_info->terminated = 1;
+    apr_atomic_set32(&queue_info->terminated, 1);
     apr_thread_cond_broadcast(queue_info->wait_for_idler);
     return apr_thread_mutex_unlock(queue_info->idlers_mutex);
 }

-/**
+/*
+ * Lock/unlock the fd_queue_t.
+ */ +#define queue_lock(q) apr_thread_mutex_lock((q)->one_big_mutex) +#define queue_unlock(q) apr_thread_mutex_unlock((q)->one_big_mutex) + +/* * Detects when the fd_queue_t is full. This utility function is expected * to be called from within critical sections, and is not threadsafe. */ -#define ap_queue_full(queue) ((queue)->nelts == (queue)->bounds) +#define queue_full(q) ((q)->nelts == (q)->bounds) -/** +/* * Detects when the fd_queue_t is empty. This utility function is expected * to be called from within critical sections, and is not threadsafe. */ -#define ap_queue_empty(queue) ((queue)->nelts == 0 && \ - APR_RING_EMPTY(&queue->timers, \ - timer_event_t, link)) +#define queue_empty(q) ((q)->nelts == 0) -/** - * Callback routine that is called to destroy this - * fd_queue_t when its pool is destroyed. - */ -static apr_status_t ap_queue_destroy(void *data) -{ - fd_queue_t *queue = data; - - /* Ignore errors here, we can't do anything about them anyway. - * XXX: We should at least try to signal an error here, it is - * indicative of a programmer error. -aaron */ - apr_thread_cond_destroy(queue->not_empty); - apr_thread_mutex_destroy(queue->one_big_mutex); - - return APR_SUCCESS; -} - -/** +/* * Initialize the fd_queue_t. */ -apr_status_t ap_queue_create(fd_queue_t **pqueue, int capacity, apr_pool_t *p) +AP_DECLARE(apr_status_t) ap_queue_create(fd_queue_t **pqueue, int capacity, + apr_pool_t *p) { apr_status_t rv; fd_queue_t *queue; @@ -361,143 +380,264 @@ apr_status_t ap_queue_create(fd_queue_t **pqueue, int capacity, apr_pool_t *p) return rv; } - APR_RING_INIT(&queue->timers, timer_event_t, link); - - queue->data = apr_pcalloc(p, capacity * sizeof(fd_queue_elem_t)); - queue->bounds = capacity; + apr_pool_create(&queue->spare_pool, p); + APR_RING_INIT(&queue->elts, fd_queue_elem_t, link); + if (capacity > 0) { + queue->bounds = capacity; + } + else { + queue->bounds = APR_UINT32_MAX; + } - apr_pool_cleanup_register(p, queue, ap_queue_destroy, - apr_pool_cleanup_null); *pqueue = queue; - return APR_SUCCESS; } -/** - * Push a new socket onto the queue. - * - * precondition: ap_queue_info_wait_for_idler has already been called - * to reserve an idle worker thread - */ -apr_status_t ap_queue_push_socket(fd_queue_t *queue, - apr_socket_t *sd, void *sd_baton, - apr_pool_t *p) +static APR_INLINE fd_queue_elem_t *get_spare_elem(fd_queue_t *queue) +{ + fd_queue_elem_t *elem = queue->spare_elems; + if (elem == NULL) { + elem = apr_pcalloc(queue->spare_pool, sizeof(*elem)); + } + else { + queue->spare_elems = elem->next; + elem->next = NULL; + } + return elem; +} + +static APR_INLINE void put_spare_elem(fd_queue_t *queue, fd_queue_elem_t *elem) +{ + elem->event = NULL; + elem->next = queue->spare_elems; + queue->spare_elems = elem; +} + +static APR_INLINE void enqueue_elem(fd_queue_t *queue, fd_queue_elem_t *elem, + ap_queue_event_t *event) +{ + if (event) { + elem->event = event; + } + else { + elem->event = &elem->self_event; + } + elem->event->elem = elem; + + APR_RING_INSERT_TAIL(&queue->elts, elem, fd_queue_elem_t, link); + queue->nelts++; +} + +static APR_INLINE void dequeue_elem(fd_queue_t *queue, fd_queue_elem_t *elem) +{ + elem->event->elem = NULL; + ap_assert(queue->nelts > 0); + APR_RING_REMOVE(elem, link); + APR_RING_ELEM_INIT(elem, link); + queue->nelts--; +} + +/* Pushes the last available element to the queue. 
*/ +static void push_elem(fd_queue_t *queue, fd_queue_elem_t **pushed_elem, + ap_queue_event_t *event) { fd_queue_elem_t *elem; + + AP_DEBUG_ASSERT(!queue_full(queue)); + AP_DEBUG_ASSERT(!queue->terminated); + + elem = get_spare_elem(queue); + enqueue_elem(queue, elem, event); + + if (pushed_elem) { + *pushed_elem = elem; + } +} + +/* + * Retrieves the oldest available element from the queue, waiting until one + * becomes available. + */ +static apr_status_t pop_elem(fd_queue_t *queue, fd_queue_elem_t **pelem) +{ apr_status_t rv; - if ((rv = apr_thread_mutex_lock(queue->one_big_mutex)) != APR_SUCCESS) { + for (;;) { + if (queue->terminated) { + return APR_EOF; /* no more elements ever again */ + } + + if (queue->interrupted) { + queue->interrupted--; + return APR_EINTR; + } + + if (!queue_empty(queue)) { + *pelem = APR_RING_FIRST(&queue->elts); + dequeue_elem(queue, *pelem); + return APR_SUCCESS; + } + + queue->num_waiters++; + rv = apr_thread_cond_wait(queue->not_empty, queue->one_big_mutex); + queue->num_waiters--; + if (rv != APR_SUCCESS) { + return rv; + } + } +} + +AP_DECLARE(apr_status_t) ap_queue_push_event(fd_queue_t *queue, + ap_queue_event_t *event) +{ + apr_status_t rv; + + if ((rv = queue_lock(queue)) != APR_SUCCESS) { return rv; } - AP_DEBUG_ASSERT(!queue->terminated); - AP_DEBUG_ASSERT(!ap_queue_full(queue)); - - elem = &queue->data[queue->in++]; - if (queue->in >= queue->bounds) - queue->in -= queue->bounds; - elem->sd = sd; - elem->sd_baton = sd_baton; - elem->p = p; - queue->nelts++; + switch (event->type) { + case AP_QUEUE_EVENT_SOCK: + case AP_QUEUE_EVENT_TIMER: + case AP_QUEUE_EVENT_BATON: + push_elem(queue, NULL, event); + if (event->cb) { + event->cb(event->cb_baton, 1); + } + apr_thread_cond_signal(queue->not_empty); + break; - apr_thread_cond_signal(queue->not_empty); + default: + rv = APR_EINVAL; + break; + } - return apr_thread_mutex_unlock(queue->one_big_mutex); + queue_unlock(queue); + return rv; } -apr_status_t ap_queue_push_timer(fd_queue_t *queue, timer_event_t *te) +AP_DECLARE(apr_status_t) ap_queue_pop_event(fd_queue_t *queue, + ap_queue_event_t **pevent) { apr_status_t rv; + fd_queue_elem_t *elem; + + *pevent = NULL; - if ((rv = apr_thread_mutex_lock(queue->one_big_mutex)) != APR_SUCCESS) { + if ((rv = queue_lock(queue)) != APR_SUCCESS) { return rv; } - AP_DEBUG_ASSERT(!queue->terminated); + rv = pop_elem(queue, &elem); + if (rv == APR_SUCCESS) { + ap_queue_event_t *event = elem->event; + ap_assert(event && event != &elem->self_event); + put_spare_elem(queue, elem); + if (event->cb) { + event->cb(event->cb_baton, 0); + } + *pevent = event; + } - APR_RING_INSERT_TAIL(&queue->timers, te, timer_event_t, link); + queue_unlock(queue); + return rv; +} - apr_thread_cond_signal(queue->not_empty); +AP_DECLARE(void) ap_queue_kill_event_locked(fd_queue_t *queue, + ap_queue_event_t *event) +{ + fd_queue_elem_t *elem = event->elem; + ap_assert(elem && APR_RING_NEXT(elem, link) != elem); - return apr_thread_mutex_unlock(queue->one_big_mutex); + dequeue_elem(queue, elem); + put_spare_elem(queue, elem); + if (event->cb) { + event->cb(event->cb_baton, 0); + } +} + +AP_DECLARE(apr_status_t) ap_queue_lock(fd_queue_t *queue) +{ + return queue_lock(queue); +} + +AP_DECLARE(apr_status_t) ap_queue_unlock(fd_queue_t *queue) +{ + return queue_unlock(queue); } /** - * Retrieves the next available socket from the queue. If there are no - * sockets available, it will block until one becomes available. - * Once retrieved, the socket is placed into the address specified by - * 'sd'. 
+ * Push a socket onto the queue. */ -apr_status_t ap_queue_pop_something(fd_queue_t *queue, - apr_socket_t **sd, void **sd_baton, - apr_pool_t **p, timer_event_t **te_out) +AP_DECLARE(apr_status_t) ap_queue_push_socket(fd_queue_t *queue, apr_socket_t *sd, + apr_pool_t *p) { - fd_queue_elem_t *elem; - timer_event_t *te; apr_status_t rv; + fd_queue_elem_t *elem; + + ap_assert(sd != NULL); - if ((rv = apr_thread_mutex_lock(queue->one_big_mutex)) != APR_SUCCESS) { + if ((rv = queue_lock(queue)) != APR_SUCCESS) { return rv; } - /* Keep waiting until we wake up and find that the queue is not empty. */ - if (ap_queue_empty(queue)) { - if (!queue->terminated) { - apr_thread_cond_wait(queue->not_empty, queue->one_big_mutex); - } - /* If we wake up and it's still empty, then we were interrupted */ - if (ap_queue_empty(queue)) { - rv = apr_thread_mutex_unlock(queue->one_big_mutex); - if (rv != APR_SUCCESS) { - return rv; - } - if (queue->terminated) { - return APR_EOF; /* no more elements ever again */ - } - else { - return APR_EINTR; - } - } + push_elem(queue, &elem, NULL); + elem->event->type = AP_QUEUE_EVENT_SOCK; + elem->event->data.se = &elem->self_sock_event; + elem->event->data.se->baton = NULL; + elem->event->data.se->sd = sd; + elem->event->data.se->p = p; + + apr_thread_cond_signal(queue->not_empty); + + queue_unlock(queue); + return APR_SUCCESS; +} + +/** + * Pop a socket from the queue. + */ +AP_DECLARE(apr_status_t) ap_queue_pop_socket(fd_queue_t *queue, apr_socket_t **psd, + apr_pool_t **pp) +{ + apr_status_t rv; + fd_queue_elem_t *elem; + + if (psd) { + *psd = NULL; + } + if (pp) { + *pp = NULL; } - te = NULL; - if (te_out) { - if (!APR_RING_EMPTY(&queue->timers, timer_event_t, link)) { - te = APR_RING_FIRST(&queue->timers); - APR_RING_REMOVE(te, link); - } - *te_out = te; + if ((rv = queue_lock(queue)) != APR_SUCCESS) { + return rv; } - if (!te) { - elem = &queue->data[queue->out++]; - if (queue->out >= queue->bounds) - queue->out -= queue->bounds; - queue->nelts--; - *sd = elem->sd; - if (sd_baton) { - *sd_baton = elem->sd_baton; + rv = pop_elem(queue, &elem); + if (rv == APR_SUCCESS) { + ap_queue_event_t *event = elem->event; + ap_assert(event && event == &elem->self_event); + ap_assert(event->data.se == &elem->self_sock_event); + ap_assert(event->type == AP_QUEUE_EVENT_SOCK); + if (psd) { + *psd = event->data.se->sd; + } + if (pp) { + *pp = event->data.se->p; } - *p = elem->p; -#ifdef AP_DEBUG - elem->sd = NULL; - elem->p = NULL; -#endif /* AP_DEBUG */ + put_spare_elem(queue, elem); } - return apr_thread_mutex_unlock(queue->one_big_mutex); + queue_unlock(queue); + return rv; } static apr_status_t queue_interrupt(fd_queue_t *queue, int all, int term) { apr_status_t rv; - if (queue->terminated) { - return APR_EOF; - } - - if ((rv = apr_thread_mutex_lock(queue->one_big_mutex)) != APR_SUCCESS) { + if ((rv = queue_lock(queue)) != APR_SUCCESS) { return rv; } @@ -505,15 +645,21 @@ static apr_status_t queue_interrupt(fd_queue_t *queue, int all, int term) * we could end up setting it and waking everybody up just after a * would-be popper checks it but right before they block */ + queue->interrupted = 1; if (term) { queue->terminated = 1; } - if (all) + if (all) { + if (queue->num_waiters > 1) + queue->interrupted += queue->num_waiters - 1; apr_thread_cond_broadcast(queue->not_empty); - else + } + else { apr_thread_cond_signal(queue->not_empty); + } - return apr_thread_mutex_unlock(queue->one_big_mutex); + queue_unlock(queue); + return APR_SUCCESS; } apr_status_t 
ap_queue_interrupt_all(fd_queue_t *queue)

diff --git a/server/mpm_fdqueue.h b/server/mpm_fdqueue.h
index 260e22ab80e..29297fd60d5 100644
--- a/server/mpm_fdqueue.h
+++ b/server/mpm_fdqueue.h
@@ -27,7 +27,7 @@
 #include

-/* This code is not AP_DECLARE()ed/exported, and used by MPMs event/worker
+/* This code is AP_DECLARE()ed/exported but used by MPMs event/worker
  * only (for now), not worth thinking about w/o threads either...
  */
 #if APR_HAS_THREADS
@@ -40,28 +40,48 @@
 #include
 #include

+struct fd_queue_t;      /* opaque */
 struct fd_queue_info_t; /* opaque */
 struct fd_queue_elem_t; /* opaque */
+typedef struct fd_queue_t fd_queue_t;
 typedef struct fd_queue_info_t fd_queue_info_t;
 typedef struct fd_queue_elem_t fd_queue_elem_t;

 AP_DECLARE(apr_status_t) ap_queue_info_create(fd_queue_info_t **queue_info,
-                                              apr_pool_t *pool, int max_idlers,
-                                              int max_recycled_pools);
+                                              apr_pool_t *pool, int max_recycled_pools);
 AP_DECLARE(apr_status_t) ap_queue_info_set_idle(fd_queue_info_t *queue_info,
                                                 apr_pool_t *pool_to_recycle);
 AP_DECLARE(apr_status_t) ap_queue_info_try_get_idler(fd_queue_info_t *queue_info);
-AP_DECLARE(apr_status_t) ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info,
-                                                      int *had_to_block);
+AP_DECLARE(apr_status_t) ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info);
 AP_DECLARE(apr_uint32_t) ap_queue_info_num_idlers(fd_queue_info_t *queue_info);
 AP_DECLARE(apr_status_t) ap_queue_info_term(fd_queue_info_t *queue_info);

-AP_DECLARE(void) ap_queue_info_pop_pool(fd_queue_info_t *queue_info,
-                                        apr_pool_t **recycled_pool);
+/* Async API */
+AP_DECLARE(apr_int32_t) ap_queue_info_idlers_inc(fd_queue_info_t *queue_info);
+AP_DECLARE(apr_int32_t) ap_queue_info_idlers_dec(fd_queue_info_t *queue_info);
+AP_DECLARE(apr_int32_t) ap_queue_info_idlers_count(fd_queue_info_t *queue_info);
+
+AP_DECLARE(apr_pool_t *) ap_queue_info_pop_pool(fd_queue_info_t *queue_info);
 AP_DECLARE(void) ap_queue_info_push_pool(fd_queue_info_t *queue_info,
                                          apr_pool_t *pool_to_recycle);
 AP_DECLARE(void) ap_queue_info_free_idle_pools(fd_queue_info_t *queue_info);

+enum ap_queue_event_type_e
+{
+    AP_QUEUE_EVENT_SOCK,
+    AP_QUEUE_EVENT_TIMER,
+    AP_QUEUE_EVENT_BATON,
+};
+typedef enum ap_queue_event_type_e ap_queue_event_type_e;
+
+struct sock_event_t
+{
+    apr_pool_t *p;
+    apr_socket_t *sd;
+    void *baton;
+};
+typedef struct sock_event_t sock_event_t;
+
 struct timer_event_t
 {
     APR_RING_ENTRY(timer_event_t) link;
@@ -74,33 +94,47 @@ struct timer_event_t
 };
 typedef struct timer_event_t timer_event_t;

-struct fd_queue_t
-{
-    APR_RING_HEAD(timers_t, timer_event_t) timers;
-    fd_queue_elem_t *data;
-    unsigned int nelts;
-    unsigned int bounds;
-    unsigned int in;
-    unsigned int out;
-    apr_thread_mutex_t *one_big_mutex;
-    apr_thread_cond_t *not_empty;
-    volatile int terminated;
+struct ap_queue_event_t
+{
+    /* event data */
+    ap_queue_event_type_e type;
+    union {
+        sock_event_t *se;
+        timer_event_t *te;
+        void *baton;
+    } data;
+
+    /* called back when the event is pushed/popped,
+     * under the queue lock (must not block!)
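A hypothetical usage sketch for this event API (not part of the patch): a baton event whose callback only touches an atomic counter, which satisfies the "must not block under the queue lock" contract stated above:

```c
#include <apr_atomic.h>
#include "mpm_fdqueue.h"

static apr_uint32_t queued_batons;

/* Runs under the queue lock on push (pushed == 1) and pop (0), so it
 * must not block; bumping an atomic counter is fine. */
static void count_cb(void *cb_baton, int pushed)
{
    (void)cb_baton;
    if (pushed)
        apr_atomic_inc32(&queued_batons);
    else
        apr_atomic_dec32(&queued_batons);
}

static apr_status_t queue_baton(fd_queue_t *queue, ap_queue_event_t *event,
                                void *data)
{
    /* The event must live at least until it is popped; callers would
     * allocate it from a suitably long-lived pool. */
    event->type = AP_QUEUE_EVENT_BATON;
    event->data.baton = data;
    event->cb = count_cb;
    event->cb_baton = NULL;
    return ap_queue_push_event(queue, event);
}
```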
+ */ + void (*cb)(void *baton, int pushed); + void *cb_baton; + + /* link in container when queued (for internal use) */ + fd_queue_elem_t *elem; }; -typedef struct fd_queue_t fd_queue_t; +typedef struct ap_queue_event_t ap_queue_event_t; + +AP_DECLARE(apr_status_t) ap_queue_create(fd_queue_t **pqueue, int capacity, + apr_pool_t *p); + +/* mpm_event API (queue of any event) */ +AP_DECLARE(apr_status_t) ap_queue_push_event(fd_queue_t *queue, + ap_queue_event_t *event); +AP_DECLARE(apr_status_t) ap_queue_pop_event(fd_queue_t *queue, + ap_queue_event_t **pevent); +AP_DECLARE(apr_status_t) ap_queue_lock(fd_queue_t *queue); +AP_DECLARE(void) ap_queue_kill_event_locked(fd_queue_t *queue, + ap_queue_event_t *event); +AP_DECLARE(apr_status_t) ap_queue_unlock(fd_queue_t *queue); -AP_DECLARE(apr_status_t) ap_queue_create(fd_queue_t **pqueue, - int capacity, apr_pool_t *p); -AP_DECLARE(apr_status_t) ap_queue_push_socket(fd_queue_t *queue, - apr_socket_t *sd, void *sd_baton, +/* mpm_worker API (queue of socket_event_t only) */ +AP_DECLARE(apr_status_t) ap_queue_push_socket(fd_queue_t *queue, apr_socket_t *sd, apr_pool_t *p); -AP_DECLARE(apr_status_t) ap_queue_push_timer(fd_queue_t *queue, - timer_event_t *te); -AP_DECLARE(apr_status_t) ap_queue_pop_something(fd_queue_t *queue, - apr_socket_t **sd, void **sd_baton, - apr_pool_t **p, timer_event_t **te); -#define ap_queue_pop_socket(q_, s_, p_) \ - ap_queue_pop_something((q_), (s_), NULL, (p_), NULL) +AP_DECLARE(apr_status_t) ap_queue_pop_socket(fd_queue_t *queue, apr_socket_t **psd, + apr_pool_t **pp); +/* common API */ AP_DECLARE(apr_status_t) ap_queue_interrupt_all(fd_queue_t *queue); AP_DECLARE(apr_status_t) ap_queue_interrupt_one(fd_queue_t *queue); AP_DECLARE(apr_status_t) ap_queue_term(fd_queue_t *queue); From aa04f2aab4588075f0f63dc9b19d19f264a7dbfe Mon Sep 17 00:00:00 2001 From: ylavic Date: Fri, 7 Jul 2023 13:04:42 +0200 Subject: [PATCH 11/22] core,mpm_event: Non blocking shutdown. --- include/http_connection.h | 9 ++- include/scoreboard.h | 1 + modules/generators/mod_status.c | 17 +++-- modules/lua/lua_request.c | 4 ++ server/connection.c | 23 +++--- server/mpm/event/event.c | 119 ++++++++++++++++++++++---------- 6 files changed, 120 insertions(+), 53 deletions(-) diff --git a/include/http_connection.h b/include/http_connection.h index 601a4769109..78371efbb27 100644 --- a/include/http_connection.h +++ b/include/http_connection.h @@ -43,10 +43,15 @@ extern "C" { */ AP_CORE_DECLARE(void) ap_process_connection(conn_rec *c, void *csd); +#define AP_SHUTDOWN_CONN_NOFLUSH 0 +#define AP_SHUTDOWN_CONN_FLUSH 1 +#define AP_SHUTDOWN_CONN_WC 2 + /** * Shutdown the connection for writing. 
 * @param c The connection to shutdown
- * @param flush Whether or not to flush pending data before
+ * @param flush Whether to flush pending data before, and if so how to
+ *              (AP_SHUTDOWN_CONN_* flags)
 * @return APR_SUCCESS or the underlying error
 */
AP_CORE_DECLARE(apr_status_t) ap_shutdown_conn(conn_rec *c, int flush);
@@ -54,7 +59,7 @@ AP_CORE_DECLARE(apr_status_t) ap_shutdown_conn(conn_rec *c, int flush);
 /**
  * Flushes all remain data in the client send buffer
  * @param c The connection to flush
- * @remark calls ap_shutdown_conn(c, 1)
+ * @remark calls ap_shutdown_conn(c, AP_SHUTDOWN_CONN_FLUSH)
  */
 AP_CORE_DECLARE(void) ap_flush_conn(conn_rec *c);
diff --git a/include/scoreboard.h b/include/scoreboard.h
index e83e52fdb16..581f86b866c 100644
--- a/include/scoreboard.h
+++ b/include/scoreboard.h
@@ -149,6 +149,7 @@ struct process_score {
     apr_uint32_t keep_alive;  /* async connections in keep alive */
     apr_uint32_t suspended;   /* connections suspended by some module */
     apr_uint32_t wait_io;     /* async connections waiting an IO in the MPM */
+    apr_uint32_t shutdown;    /* async connections shutting down before close */
     apr_uint32_t backlog;     /* async connections waiting for a worker */
 };
diff --git a/modules/generators/mod_status.c b/modules/generators/mod_status.c
index f0cff67ac45..5ff635cc96e 100644
--- a/modules/generators/mod_status.c
+++ b/modules/generators/mod_status.c
@@ -564,8 +564,8 @@ static int status_handler(request_rec *r)
     ap_rputs("", r);

     if (is_async) {
-        int wait_io = 0, write_completion = 0, lingering_close = 0, keep_alive = 0,
-            connections = 0, stopping = 0, procs = 0;
+        int wait_io = 0, write_completion = 0, shutdown = 0, lingering_close = 0,
+            keep_alive = 0, connections = 0, stopping = 0, procs = 0;
         if (!short_report)
             ap_rputs("\n\n<table rules=\"all\" cellpadding=\"1%\">\n"
                      "<tr><th rowspan=\"2\">Slot</th>"
                      "<th rowspan=\"2\">PID</th>"
                      "<th rowspan=\"2\">Stopping</th>"
                      "<th colspan=\"2\">Connections</th>\n"
                      "<th colspan=\"3\">Threads</th>"
-                     "<th colspan=\"4\">Async connections</th></tr>\n"
+                     "<th colspan=\"5\">Async connections</th></tr>\n"
                      "<tr><th>total</th><th>accepting</th>"
                      "<th>busy</th><th>graceful</th><th>idle</th>"
-                     "<th>wait-io</th><th>writing</th><th>keep-alive</th><th>closing</th></tr>\n", r);
+                     "<th>wait-io</th><th>writing</th><th>keep-alive</th>"
+                     "<th>shutdown</th><th>closing</th></tr>\n", r);
         for (i = 0; i < server_limit; ++i) {
             ps_record = ap_get_scoreboard_process(i);
             if (ps_record->pid) {
@@ -585,6 +585,7 @@ static int status_handler(request_rec *r)
                 wait_io += ps_record->wait_io;
                 write_completion += ps_record->write_completion;
                 keep_alive += ps_record->keep_alive;
+                shutdown += ps_record->shutdown;
                 lingering_close += ps_record->lingering_close;
                 procs++;
                 if (ps_record->quiescing) {
@@ -601,7 +602,7 @@ static int status_handler(request_rec *r)
                 ap_rprintf(r, "<tr><td>%u</td><td>%" APR_PID_T_FMT "</td>"
                               "<td>%s%s</td><td>%u</td><td>%s</td>"
                               "<td>%u</td><td>%u</td><td>%u</td>"
-                              "<td>%u</td><td>%u</td><td>%u</td>"
+                              "<td>%u</td><td>%u</td><td>%u</td><td>%u</td>"
                               "<td>%u</td></tr>\n",
                               i, ps_record->pid,
@@ -614,6 +615,7 @@ static int status_handler(request_rec *r)
                               ps_record->wait_io,
                               ps_record->write_completion,
                               ps_record->keep_alive,
+                              ps_record->shutdown,
                               ps_record->lingering_close);
             }
         }
@@ -622,14 +624,14 @@ static int status_handler(request_rec *r)
         ap_rprintf(r, "<tr><td>Sum</td>"
                       "<td>%d</td><td>%d</td>"
                       "<td>%d</td><td>&nbsp;</td>"
                       "<td>%d</td><td>%d</td><td>%d</td>"
-                      "<td>%d</td><td>%d</td><td>%d</td>"
+                      "<td>%d</td><td>%d</td><td>%d</td><td>%d</td>"
                       "<td>%d</td></tr>
\n", procs, stopping, connections, busy, graceful, idle, wait_io, write_completion, keep_alive, - lingering_close); + shutdown, lingering_close); } else { ap_rprintf(r, "Processes: %d\n" @@ -638,11 +640,12 @@ static int status_handler(request_rec *r) "ConnsAsyncWaitIO: %d\n" "ConnsAsyncWriting: %d\n" "ConnsAsyncKeepAlive: %d\n" + "ConnsAsyncShutdown: %d\n" "ConnsAsyncClosing: %d\n", procs, stopping, connections, wait_io, write_completion, keep_alive, - lingering_close); + shutdown, lingering_close); } } diff --git a/modules/lua/lua_request.c b/modules/lua/lua_request.c index 5fa3a968c6b..f93c3493af4 100644 --- a/modules/lua/lua_request.c +++ b/modules/lua/lua_request.c @@ -1276,6 +1276,10 @@ static int lua_ap_scoreboard_process(lua_State *L) lua_pushnumber(L, ps_record->write_completion); lua_settable(L, -3); + lua_pushstring(L, "shutdown"); + lua_pushnumber(L, ps_record->shutdown); + lua_settable(L, -3); + lua_pushstring(L, "not_accepting"); lua_pushnumber(L, ps_record->not_accepting); lua_settable(L, -3); diff --git a/server/connection.c b/server/connection.c index a1c4c1860f0..383b769660f 100644 --- a/server/connection.c +++ b/server/connection.c @@ -111,37 +111,42 @@ AP_CORE_DECLARE(apr_status_t) ap_shutdown_conn(conn_rec *c, int flush) apr_bucket_brigade *bb; apr_bucket *b; - bb = apr_brigade_create(c->pool, c->bucket_alloc); + bb = ap_acquire_brigade(c); - if (flush) { + if (flush == AP_SHUTDOWN_CONN_WC) { + /* Write Completion bucket */ + b = ap_bucket_wc_create(c->bucket_alloc); + } + else { /* FLUSH bucket */ b = apr_bucket_flush_create(c->bucket_alloc); - APR_BRIGADE_INSERT_TAIL(bb, b); } + APR_BRIGADE_INSERT_TAIL(bb, b); /* End Of Connection bucket */ b = ap_bucket_eoc_create(c->bucket_alloc); APR_BRIGADE_INSERT_TAIL(bb, b); rv = ap_pass_brigade(c->output_filters, bb); - apr_brigade_destroy(bb); + ap_release_brigade(c, bb); return rv; } AP_CORE_DECLARE(void) ap_flush_conn(conn_rec *c) { - (void)ap_shutdown_conn(c, 1); + (void)ap_shutdown_conn(c, AP_SHUTDOWN_CONN_FLUSH); } AP_DECLARE(int) ap_prep_lingering_close(conn_rec *c) { /* Give protocol handlers one last chance to raise their voice */ - ap_run_pre_close_connection(c); + int rc = ap_run_pre_close_connection(c); if (c->sbh) { ap_update_child_status(c->sbh, SERVER_CLOSING, NULL); } - return 0; + + return (rc == DECLINED) ? OK : rc; } /* we now proceed to read from the client until we get EOF, or until @@ -172,7 +177,9 @@ AP_DECLARE(int) ap_start_lingering_close(conn_rec *c) */ /* Send any leftover data to the client, but never try to again */ - ap_flush_conn(c); + if (ap_shutdown_conn(c, AP_SHUTDOWN_CONN_FLUSH)) { + return 1; + } #ifdef NO_LINGCLOSE return 1; diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 5a9f4b676b4..8c5bee23115 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -301,7 +301,9 @@ struct event_conn_state_t { /** Is lingering close from defer_lingering_close()? */ deferred_linger :1, /** Has ap_start_lingering_close() been called? */ - linger_started :1; + linger_started :1, + /** Is lingering connection flushed and shutdown? 
*/ + linger_shutdown :1; }; #define cs_se(cs) (&(cs)->bse.se) #define cs_qe(cs) (&(cs)->bse.qe) @@ -455,6 +457,7 @@ struct timeout_queue { * waitio_q uses vhost's TimeOut * write_completion_q uses vhost's TimeOut * keepalive_q uses vhost's KeepAliveTimeOut + * shutdown_q uses vhost's TimeOut * linger_q uses MAX_SECS_TO_LINGER * short_linger_q uses SECONDS_TO_LINGER * backlog_q uses vhost's TimeOut @@ -462,6 +465,7 @@ struct timeout_queue { static struct timeout_queue *waitio_q, /* wait for I/O to happen */ *write_completion_q, /* completion or user async poll */ *keepalive_q, /* in between requests */ + *shutdown_q, /* shutting down (write) before close */ *linger_q, /* lingering (read) before close */ *short_linger_q, /* lingering (read) before close (short timeout) */ *backlog_q; /* waiting for a worker */ @@ -658,6 +662,7 @@ struct event_srv_cfg_s { struct timeout_queue *io_q, *wc_q, *ka_q, + *sh_q, *bl_q; server_rec *s; /* backref */ }; @@ -724,14 +729,15 @@ static int disable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) "Suspend listening sockets: idlers:%i conns:%u backlog:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u " - "timers:%u suspended:%u", + "waitio:%u write:%u keepalive:%u shutdown:%u " + "linger:%u/%u timers:%u suspended:%u", ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(backlog_q->total), apr_atomic_read32(waitio_q->total), apr_atomic_read32(write_completion_q->total), apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(shutdown_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&timers_count), @@ -756,14 +762,15 @@ static int enable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) "Resume listening sockets: idlers:%i conns:%u backlog:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u " - "timers:%u suspended:%u", + "waitio:%u write:%u keepalive:%u shutdown:%u " + "linger:%u/%u timers:%u suspended:%u", ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(backlog_q->total), apr_atomic_read32(waitio_q->total), apr_atomic_read32(write_completion_q->total), apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(shutdown_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&timers_count), @@ -1917,7 +1924,7 @@ static void push2worker(event_conn_state_t *cs, timer_event_t *te, ap_assert(!cs_in_backlog(cs)); ap_assert(!cs->q); - if (busy && cs->pub.state == CONN_STATE_LINGER && cs->linger_started) { + if (busy && cs->pub.state == CONN_STATE_LINGER && cs->linger_shutdown) { /* Not worth lingering more on this connection if we are short of * workers and everything is flushed+shutdown already, back out * and close. @@ -2201,19 +2208,53 @@ static void process_lingering_close(event_conn_state_t *cs) AP_DEBUG_ASSERT(CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)); /* Flush and shutdown first */ - if (!cs->linger_started) { - cs->linger_started = 1; /* once! */ - apr_atomic_inc32(&lingering_count); + if (!cs->linger_shutdown) { + conn_rec *c = cs->c; + int rc = OK; + cs->pub.state = CONN_STATE_LINGER; - apr_socket_timeout_set(csd, apr_time_from_sec(SECONDS_TO_LINGER)); - if (ap_start_lingering_close(cs->c)) { + if (!cs->linger_started) { + cs->linger_started = 1; /* once! */ + apr_atomic_inc32(&lingering_count); notify_suspend(cs); + + /* Shutdown the connection, i.e. 
pre_connection_close hooks, + * SSL/TLS close notify, WC bucket, etc.. + */ + rc = ap_prep_lingering_close(c); + if (rc == OK) { + rc = ap_shutdown_conn(c, AP_SHUTDOWN_CONN_WC); + if (rc == OK) { + if (c->aborted) { + rc = DONE; + } + else if (ap_filter_should_yield(c->output_filters)) { + rc = AGAIN; + } + } + } + } + else { + rc = ap_check_output_pending(c); + } + + cs->pub.state = CONN_STATE_LINGER; + cs->pub.sense = CONN_SENSE_DEFAULT; + if (rc == AGAIN) { + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "queuing lingering close for connection %" CS_FMT, + CS_ARG(cs)); + if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->sh_q, NULL)) { + return; /* queued */ + } + } + if (rc != OK || apr_socket_shutdown(csd, APR_SHUTDOWN_WRITE)) { close_connection(cs); return; } - - notify_suspend(cs); + + cs->linger_shutdown = 1; /* once! */ /* All nonblocking from now, no need for APR_INCOMPLETE_READ either */ apr_socket_timeout_set(csd, 0); @@ -2230,7 +2271,6 @@ static void process_lingering_close(event_conn_state_t *cs) else { cs->pub.state = CONN_STATE_LINGER_NORMAL; } - cs->pub.sense = CONN_SENSE_DEFAULT; } /* Drain until EAGAIN or EOF/error, in the former case requeue and @@ -2729,32 +2769,30 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* Recompute this by walking the timeout queues (under the lock) */ queues_next_expiry = 0; - /* Step 1: keepalive queue timeouts */ + /* Process shutdown_q first because the expired entries from the + * other queues will go there and don't need to be checked twice + * (nor do we want to potentially kill them before the shutdown). + */ + process_timeout_queue(shutdown_q, now); + + process_timeout_queue(waitio_q, now); + process_timeout_queue(write_completion_q, now); + + /* The linger and keepalive queues can be shrinked any time + * under pressure. + */ if (workers_were_busy || dying) { + shrink_timeout_queue(linger_q, now); + shrink_timeout_queue(short_linger_q, now); shrink_timeout_queue(keepalive_q, now); } else { + process_timeout_queue(linger_q, now); + process_timeout_queue(short_linger_q, now); process_timeout_queue(keepalive_q, now); } - /* Step 2: waitio queue timeouts */ - process_timeout_queue(waitio_q, now); - - /* Step 3: write completion queue timeouts */ - process_timeout_queue(write_completion_q, now); - - /* Step 4: normal lingering close queue timeouts */ - if (dying && linger_q->timeout > short_linger_q->timeout) { - /* Dying, force short timeout for normal lingering close */ - linger_q->timeout = short_linger_q->timeout; - } - process_timeout_queue(linger_q, now); - - /* Step 5: short lingering close queue timeouts */ - process_timeout_queue(short_linger_q, now); - - /* Step 6: backlog queue timeouts - * Connections in backlog race with the workers (dequeuing) under + /* Connections in backlog race with the workers (dequeuing) under * the worker_queue mutex. 
*/ if (apr_atomic_read32(backlog_q->total)) { @@ -2773,14 +2811,19 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->wait_io = apr_atomic_read32(waitio_q->total); ps->write_completion = apr_atomic_read32(write_completion_q->total); ps->keep_alive = apr_atomic_read32(keepalive_q->total); + ps->shutdown = apr_atomic_read32(shutdown_q->total); ps->lingering_close = apr_atomic_read32(&lingering_count); ps->backlog = apr_atomic_read32(backlog_q->total); ps->suspended = apr_atomic_read32(&suspended_count); ps->connections = apr_atomic_read32(&connection_count); } else if ((workers_were_busy || dying) - && apr_atomic_read32(keepalive_q->total)) { + && (apr_atomic_read32(linger_q->total) + || apr_atomic_read32(short_linger_q->total) + || apr_atomic_read32(keepalive_q->total))) { apr_thread_mutex_lock(timeout_mutex); + shrink_timeout_queue(linger_q, now); + shrink_timeout_queue(short_linger_q, now); shrink_timeout_queue(keepalive_q, now); apr_thread_mutex_unlock(timeout_mutex); ps->keep_alive = 0; @@ -4444,7 +4487,7 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, apr_pool_t *ptemp, server_rec *s) { - apr_hash_t *io_h, *wc_h, *ka_h, *bl_h; + apr_hash_t *io_h, *wc_h, *ka_h, *sh_h, *bl_h; /* Not needed in pre_config stage */ if (ap_state_query(AP_SQ_MAIN_STATE) == AP_SQ_MS_CREATE_PRE_CONFIG) { @@ -4454,6 +4497,7 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, io_h = apr_hash_make(ptemp); wc_h = apr_hash_make(ptemp); ka_h = apr_hash_make(ptemp); + sh_h = apr_hash_make(ptemp); bl_h = apr_hash_make(ptemp); linger_q = TO_QUEUE_MAKE(pconf, "linger", @@ -4475,8 +4519,11 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, sc->ka_q = TO_QUEUE_CHAIN(pconf, "keepalive", s->keep_alive_timeout, &keepalive_q, ka_h, ptemp); + sc->sh_q = TO_QUEUE_CHAIN(pconf, "shutdown", s->timeout, + &shutdown_q, sh_h, ptemp); + sc->bl_q = TO_QUEUE_CHAIN(pconf, "backlog", s->timeout, - &backlog_q, bl_h, ptemp); + &backlog_q, bl_h, ptemp); } return OK; From 364a3894b3b6c80211d615c6722f7607c3fe9d82 Mon Sep 17 00:00:00 2001 From: ylavic Date: Wed, 10 Jul 2024 15:08:28 +0200 Subject: [PATCH 12/22] mpm_event: Don't shrink keepalive queue when busy/exiting. --- server/mpm/event/event.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 8c5bee23115..f341f1daf87 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -1669,7 +1669,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, goto process_connection; } } - if (pending != OK || listener_may_exit) { + if (pending != OK) { cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } @@ -2280,7 +2280,7 @@ static void process_lingering_close(event_conn_state_t *cs) apr_size_t nbytes = sizeof(dummybuf); rv = apr_socket_recv(csd, dummybuf, &nbytes); } while (rv == APR_SUCCESS); - if (APR_STATUS_IS_EAGAIN(rv)) { + if (APR_STATUS_IS_EAGAIN(rv) && !listensocks_disabled()) { struct timeout_queue *q; q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? 
short_linger_q : linger_q; if (pollset_add(cs, CONN_SENSE_WANT_READ, q, NULL)) { @@ -2777,19 +2777,16 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) process_timeout_queue(waitio_q, now); process_timeout_queue(write_completion_q, now); + process_timeout_queue(keepalive_q, now); - /* The linger and keepalive queues can be shrinked any time - * under pressure. - */ + /* The linger queues can be shrinked any time under pressure */ if (workers_were_busy || dying) { shrink_timeout_queue(linger_q, now); shrink_timeout_queue(short_linger_q, now); - shrink_timeout_queue(keepalive_q, now); } else { process_timeout_queue(linger_q, now); process_timeout_queue(short_linger_q, now); - process_timeout_queue(keepalive_q, now); } /* Connections in backlog race with the workers (dequeuing) under @@ -2819,14 +2816,11 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } else if ((workers_were_busy || dying) && (apr_atomic_read32(linger_q->total) - || apr_atomic_read32(short_linger_q->total) - || apr_atomic_read32(keepalive_q->total))) { + || apr_atomic_read32(short_linger_q->total))) { apr_thread_mutex_lock(timeout_mutex); shrink_timeout_queue(linger_q, now); shrink_timeout_queue(short_linger_q, now); - shrink_timeout_queue(keepalive_q, now); apr_thread_mutex_unlock(timeout_mutex); - ps->keep_alive = 0; } } /* listener main loop */ From eb1eb7fb894dab129efd0f1181f3e2fd1a95ef74 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 26 Jun 2023 19:26:58 +0200 Subject: [PATCH 13/22] mpm_event: Single linger queue/timeout (short one, 2s). --- server/mpm/event/event.c | 128 +++++++++++++-------------------------- 1 file changed, 41 insertions(+), 87 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index f341f1daf87..0058ba20994 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -155,12 +155,8 @@ #define apr_time_from_msec(x) ((x) * 1000) #endif -#define CONN_STATE_IS_LINGERING_CLOSE(s) ((s) >= CONN_STATE_LINGER && \ - (s) <= CONN_STATE_LINGER_SHORT) -#ifndef MAX_SECS_TO_LINGER -#define MAX_SECS_TO_LINGER 30 -#endif -#define SECONDS_TO_LINGER 2 +/* Lingering close (read) timeout */ +#define LINGER_READ_TIMEOUT apr_time_from_sec(2) /* Don't wait more time in poll() if APR_POLLSET_WAKEABLE is not implemented */ #define NON_WAKEABLE_TIMEOUT apr_time_from_msec(100) @@ -204,7 +200,6 @@ static volatile int start_thread_may_exit = 0; static volatile int listener_may_exit = 0; static apr_uint32_t connection_count = 0; /* Number of open connections */ static apr_uint32_t timers_count = 0; /* Number of queued timers */ -static apr_uint32_t lingering_count = 0; /* Number of connections in lingering close */ static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown early during graceful termination */ @@ -458,8 +453,7 @@ struct timeout_queue { * write_completion_q uses vhost's TimeOut * keepalive_q uses vhost's KeepAliveTimeOut * shutdown_q uses vhost's TimeOut - * linger_q uses MAX_SECS_TO_LINGER - * short_linger_q uses SECONDS_TO_LINGER + * linger_q uses LINGER_READ_TIMEOUT * backlog_q uses vhost's TimeOut */ static struct timeout_queue *waitio_q, /* wait for I/O to happen */ @@ -467,7 +461,6 @@ static struct timeout_queue *waitio_q, /* wait for I/O to happen */ *keepalive_q, /* in between requests */ *shutdown_q, /* shutting down (write) before close */ *linger_q, /* lingering (read) before close */ - 
*short_linger_q, /* lingering (read) before close (short timeout) */ *backlog_q; /* waiting for a worker */ static volatile apr_time_t queues_next_expiry; /* next expiry time accross all queues */ @@ -730,7 +723,7 @@ static int disable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) "Suspend listening sockets: idlers:%i conns:%u backlog:%u " "waitio:%u write:%u keepalive:%u shutdown:%u " - "linger:%u/%u timers:%u suspended:%u", + "linger:%u timers:%u suspended:%u", ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(backlog_q->total), @@ -739,7 +732,6 @@ static int disable_listensocks(void) apr_atomic_read32(keepalive_q->total), apr_atomic_read32(shutdown_q->total), apr_atomic_read32(linger_q->total), - apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); @@ -763,7 +755,7 @@ static int enable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) "Resume listening sockets: idlers:%i conns:%u backlog:%u " "waitio:%u write:%u keepalive:%u shutdown:%u " - "linger:%u/%u timers:%u suspended:%u", + "linger:%u timers:%u suspended:%u", ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(backlog_q->total), @@ -772,7 +764,6 @@ static int enable_listensocks(void) apr_atomic_read32(keepalive_q->total), apr_atomic_read32(shutdown_q->total), apr_atomic_read32(linger_q->total), - apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); @@ -798,7 +789,7 @@ static APR_INLINE int connections_above_limit(int *busy) apr_int32_t i_count = ap_queue_info_idlers_count(worker_queue_info); if (i_count > 0) { apr_uint32_t c_count = apr_atomic_read32(&connection_count); - apr_uint32_t l_count = apr_atomic_read32(&lingering_count); + apr_uint32_t l_count = apr_atomic_read32(linger_q->total); if (c_count <= l_count /* Off by 'listensocks_disabled()' to avoid flip flop */ || c_count - l_count < (apr_uint32_t)threads_per_child + @@ -1092,17 +1083,12 @@ static apr_status_t decrement_connection_count(void *cs_) CS_ARG_TO(cs)); switch (cs->pub.state) { - case CONN_STATE_LINGER: - case CONN_STATE_LINGER_NORMAL: - case CONN_STATE_LINGER_SHORT: - apr_atomic_dec32(&lingering_count); - break; - case CONN_STATE_SUSPENDED: - apr_atomic_dec32(&suspended_count); - break; - default: - break; + case CONN_STATE_SUSPENDED: + apr_atomic_dec32(&suspended_count); + default: + break; } + /* Unblock the listener if it's waiting for connection_count = 0, * or if the listening sockets were disabled due to limits and can * now accept new connections. @@ -1185,7 +1171,7 @@ static void push2worker(event_conn_state_t *cs, timer_event_t *te, apr_time_t now, int *busy); /* Shutdown the connection in case of timeout, error or resources shortage. - * This starts short lingering close if not already there, or directly closes + * This starts lingering close if not already there, or directly closes * the connection otherwise. * Pre-condition: nonblocking, can be called from anywhere provided cs is not * in the pollset nor any non-backlog timeout queue. 
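
For reference, the drain step that the following hunks converge on, now with a
single short lingering timeout, boils down to this sketch (illustrative only:
requeue_for_read() and close_now() stand in for the MPM's pollset_add() and
close_connection() plumbing):

    /* Drain until EAGAIN (peer may still be sending, so poll again with the
     * single 2s LINGER_READ_TIMEOUT) or until EOF/error (close right away).
     */
    char dummybuf[512];
    apr_size_t nbytes;
    apr_status_t rv;
    do {
        nbytes = sizeof(dummybuf);
        rv = apr_socket_recv(csd, dummybuf, &nbytes);
    } while (rv == APR_SUCCESS);
    if (APR_STATUS_IS_EAGAIN(rv) && !listensocks_disabled()) {
        requeue_for_read(cs);   /* hypothetical: pollset_add(..., linger_q, ...) */
    }
    else {
        close_now(cs);          /* hypothetical: close_connection(cs) */
    }
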
@@ -1199,8 +1185,6 @@ static void shutdown_connection(event_conn_state_t *cs, apr_time_t now, int log_level = APLOG_INFO; switch (cs->pub.state) { case CONN_STATE_LINGER: - case CONN_STATE_LINGER_NORMAL: - case CONN_STATE_LINGER_SHORT: case CONN_STATE_KEEPALIVE: log_level = APLOG_TRACE2; default: @@ -1214,8 +1198,7 @@ static void shutdown_connection(event_conn_state_t *cs, apr_time_t now, /* Don't re-schedule connections in lingering close, they had * their chance already so just close them now. */ - if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + if (cs->pub.state != CONN_STATE_LINGER) { cs->pub.state = CONN_STATE_LINGER; push2worker(cs, NULL, now, NULL); } @@ -1530,7 +1513,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, "processing connection %" CS_FMT " (aborted %d, clogging %d)", CS_ARG(cs), c->aborted, c->clogging_input_filters); - if (CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { + if (cs->pub.state == CONN_STATE_LINGER) { goto lingering_close; } @@ -1628,7 +1611,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, q = cs->sc->io_q; } if (!pollset_add(cs, CONN_SENSE_WANT_READ, q, te)) { - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } @@ -1658,7 +1640,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, return; /* queued */ } /* Fall through lingering close */ - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); } else if (pending == OK) { /* Some data to process immediately? */ @@ -1692,7 +1673,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, notify_suspend(cs); if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q, NULL)) { - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } @@ -1730,16 +1710,15 @@ static apr_status_t event_resume_suspended (conn_rec *c) c->suspended_baton = NULL; cs->pub.sense = CONN_SENSE_DEFAULT; - if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { + if (cs->pub.state != CONN_STATE_LINGER) { cs->pub.state = CONN_STATE_WRITE_COMPLETION; if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) { return APR_SUCCESS; /* queued */ } /* fall through lingering close on error */ - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + cs->pub.state = CONN_STATE_LINGER; } - cs->pub.state = CONN_STATE_LINGER; process_lingering_close(cs); return APR_SUCCESS; } @@ -2205,7 +2184,7 @@ static void process_lingering_close(event_conn_state_t *cs) ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, "lingering close for connection %" CS_FMT, CS_ARG(cs)); - AP_DEBUG_ASSERT(CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)); + AP_DEBUG_ASSERT(cs->pub.state == CONN_STATE_LINGER); /* Flush and shutdown first */ if (!cs->linger_shutdown) { @@ -2216,7 +2195,6 @@ static void process_lingering_close(event_conn_state_t *cs) if (!cs->linger_started) { cs->linger_started = 1; /* once! */ - apr_atomic_inc32(&lingering_count); notify_suspend(cs); /* Shutdown the connection, i.e. pre_connection_close hooks, @@ -2259,18 +2237,6 @@ static void process_lingering_close(event_conn_state_t *cs) /* All nonblocking from now, no need for APR_INCOMPLETE_READ either */ apr_socket_timeout_set(csd, 0); apr_socket_opt_set(csd, APR_INCOMPLETE_READ, 0); - - /* - * If some module requested a shortened waiting period, only wait for - * 2s (SECONDS_TO_LINGER). This is useful for mitigating certain - * DoS attacks. 
- */ - if (apr_table_get(cs->c->notes, "short-lingering-close")) { - cs->pub.state = CONN_STATE_LINGER_SHORT; - } - else { - cs->pub.state = CONN_STATE_LINGER_NORMAL; - } } /* Drain until EAGAIN or EOF/error, in the former case requeue and @@ -2280,14 +2246,12 @@ static void process_lingering_close(event_conn_state_t *cs) apr_size_t nbytes = sizeof(dummybuf); rv = apr_socket_recv(csd, dummybuf, &nbytes); } while (rv == APR_SUCCESS); - if (APR_STATUS_IS_EAGAIN(rv) && !listensocks_disabled()) { - struct timeout_queue *q; - q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; - if (pollset_add(cs, CONN_SENSE_WANT_READ, q, NULL)) { - return; /* queued */ - } + + if (!APR_STATUS_IS_EAGAIN(rv) + || listensocks_disabled() /* busy enough */ + || !pollset_add(cs, CONN_SENSE_WANT_READ, linger_q, NULL)) { + close_connection(cs); } - close_connection(cs); } /* Call shutdown_connection() for the elements of 'q' that timed out, or @@ -2437,22 +2401,20 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } } - if (APLOGtrace6(ap_server_conf)) { - /* trace log status every second */ - if (now - last_log > apr_time_from_sec(1)) { - ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, - "connections: %u (waitio:%u write:%u keepalive:%u " - "lingering:%u suspended:%u), workers: %u/%u shutdown", - apr_atomic_read32(&connection_count), - apr_atomic_read32(waitio_q->total), - apr_atomic_read32(write_completion_q->total), - apr_atomic_read32(keepalive_q->total), - apr_atomic_read32(&lingering_count), - apr_atomic_read32(&suspended_count), - apr_atomic_read32(&threads_shutdown), - threads_per_child); - last_log = now; - } + /* trace log status every second */ + if (APLOGtrace6(ap_server_conf) && now - last_log > apr_time_from_sec(1)) { + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, + "connections: %u (waitio:%d write:%d keepalive:%d " + "lingering:%d suspended:%u), workers: %u/%u shutdown", + apr_atomic_read32(&connection_count), + apr_atomic_read32(waitio_q->total), + apr_atomic_read32(write_completion_q->total), + apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(linger_q->total), + apr_atomic_read32(&suspended_count), + apr_atomic_read32(&threads_shutdown), + threads_per_child); + last_log = now; } #if HAVE_SERF @@ -2608,8 +2570,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) case CONN_STATE_ASYNC_WAITIO: cs->pub.state = CONN_STATE_PROCESSING; case CONN_STATE_WRITE_COMPLETION: - case CONN_STATE_LINGER_NORMAL: - case CONN_STATE_LINGER_SHORT: + case CONN_STATE_LINGER: break; default: @@ -2779,14 +2740,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) process_timeout_queue(write_completion_q, now); process_timeout_queue(keepalive_q, now); - /* The linger queues can be shrinked any time under pressure */ + /* The linger queue can be shrinked any time under pressure */ if (workers_were_busy || dying) { shrink_timeout_queue(linger_q, now); - shrink_timeout_queue(short_linger_q, now); } else { process_timeout_queue(linger_q, now); - process_timeout_queue(short_linger_q, now); } /* Connections in backlog race with the workers (dequeuing) under @@ -2809,17 +2768,15 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->write_completion = apr_atomic_read32(write_completion_q->total); ps->keep_alive = apr_atomic_read32(keepalive_q->total); ps->shutdown = apr_atomic_read32(shutdown_q->total); - ps->lingering_close = 
apr_atomic_read32(&lingering_count); + ps->lingering_close = apr_atomic_read32(linger_q->total); ps->backlog = apr_atomic_read32(backlog_q->total); ps->suspended = apr_atomic_read32(&suspended_count); ps->connections = apr_atomic_read32(&connection_count); } else if ((workers_were_busy || dying) - && (apr_atomic_read32(linger_q->total) - || apr_atomic_read32(short_linger_q->total))) { + && apr_atomic_read32(linger_q->total)) { apr_thread_mutex_lock(timeout_mutex); shrink_timeout_queue(linger_q, now); - shrink_timeout_queue(short_linger_q, now); apr_thread_mutex_unlock(timeout_mutex); } } /* listener main loop */ @@ -4494,10 +4451,7 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, sh_h = apr_hash_make(ptemp); bl_h = apr_hash_make(ptemp); - linger_q = TO_QUEUE_MAKE(pconf, "linger", - apr_time_from_sec(MAX_SECS_TO_LINGER), NULL); - short_linger_q = TO_QUEUE_MAKE(pconf, "short_linger", - apr_time_from_sec(SECONDS_TO_LINGER), NULL); + linger_q = TO_QUEUE_MAKE(pconf, "linger", LINGER_READ_TIMEOUT, NULL); for (; s; s = s->next) { event_srv_cfg *sc = apr_pcalloc(pconf, sizeof *sc); From c82d67ad99dd9f725135a8a0e5d8bf87b9a9b2d8 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 26 Jun 2023 21:55:25 +0200 Subject: [PATCH 14/22] mpm_event: Periodic linger queue shrink (500ms). --- server/mpm/event/event.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 0058ba20994..2d33613c41f 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -158,6 +158,9 @@ /* Lingering close (read) timeout */ #define LINGER_READ_TIMEOUT apr_time_from_sec(2) +/* Shrink linger_q at this period (min) when busy */ +#define QUEUES_SHRINK_TIMEOUT apr_time_from_msec(500) + /* Don't wait more time in poll() if APR_POLLSET_WAKEABLE is not implemented */ #define NON_WAKEABLE_TIMEOUT apr_time_from_msec(100) @@ -2348,7 +2351,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) proc_info *ti = dummy; int process_slot = ti->pslot; process_score *ps = ap_get_scoreboard_process(process_slot); - apr_time_t last_log; + apr_time_t last_log, next_shrink_time = 0; last_log = event_time_now(); free(ti); @@ -2743,6 +2746,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* The linger queue can be shrinked any time under pressure */ if (workers_were_busy || dying) { shrink_timeout_queue(linger_q, now); + next_shrink_time = now + QUEUES_SHRINK_TIMEOUT; } else { process_timeout_queue(linger_q, now); @@ -2773,11 +2777,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->suspended = apr_atomic_read32(&suspended_count); ps->connections = apr_atomic_read32(&connection_count); } - else if ((workers_were_busy || dying) + else if (next_shrink_time <= now + && (workers_were_busy || dying) && apr_atomic_read32(linger_q->total)) { apr_thread_mutex_lock(timeout_mutex); shrink_timeout_queue(linger_q, now); apr_thread_mutex_unlock(timeout_mutex); + next_shrink_time = now + QUEUES_SHRINK_TIMEOUT; } } /* listener main loop */ From eddf29957cfe27fb6243a16767d1508fa1295254 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 26 Jun 2023 20:05:33 +0200 Subject: [PATCH 15/22] mpm_event: Use atomic reads/writes for shared resources. 
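
The pattern, sketched: counters like the queues' totals are only ever written
under timeout_mutex, so the writer side keeps a plain serialized update (the
cheaper apr_atomic_set32() rather than a locked read-modify-write with
apr_atomic_inc32()), while readers take lock-free snapshots:

    apr_thread_mutex_lock(timeout_mutex);
    apr_atomic_set32(q->total, *q->total + 1);  /* writers serialized by lock */
    apr_thread_mutex_unlock(timeout_mutex);

    /* elsewhere, without the lock (stats, logs, heuristics) */
    apr_uint32_t n = apr_atomic_read32(q->total);
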
--- server/mpm/event/event.c | 123 ++++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 48 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 2d33613c41f..37e6f1b63fd 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -197,15 +197,16 @@ static int server_limit = 0; /* ServerLimit */ static int thread_limit = 0; /* ThreadLimit */ static int conns_this_child = 0; /* MaxConnectionsPerChild, only accessed in listener thread */ -static volatile int dying = 0; -static volatile int workers_may_exit = 0; -static volatile int start_thread_may_exit = 0; -static volatile int listener_may_exit = 0; -static apr_uint32_t connection_count = 0; /* Number of open connections */ -static apr_uint32_t timers_count = 0; /* Number of queued timers */ -static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ -static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown - early during graceful termination */ +static /*atomic*/ apr_uint32_t dying = 0; +static /*atomic*/ apr_uint32_t workers_may_exit = 0; +static /*atomic*/ apr_uint32_t start_thread_may_exit = 0; +static /*atomic*/ apr_uint32_t listener_may_exit = 0; +static /*atomic*/ apr_uint32_t connection_count = 0; /* Number of open connections */ +static /*atomic*/ apr_uint32_t timers_count = 0; /* Number of queued timers */ +static /*atomic*/ apr_uint32_t suspended_count = 0; /* Number of suspended connections */ +static /*atomic*/ apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown + early during graceful termination */ + static int had_healthy_child = 0; static int resource_shortage = 0; @@ -481,9 +482,14 @@ static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *cs) cs->q = q; cs->queue_timestamp = event_time_now(); APR_RING_INSERT_TAIL(&q->head, cs, event_conn_state_t, timeout_list); - ++*q->total; ++q->count; + /* Use atomic_set to be ordered/consistent with potential atomic reads + * outside the critical section, but writes are protected so a more + * expensive atomic_inc is not needed. + */ + apr_atomic_set32(q->total, *q->total + 1); + /* Cheaply update the global queues_next_expiry with the one of the * first entry of this queue (oldest) if it expires before. */ @@ -506,8 +512,13 @@ static void TO_QUEUE_REMOVE(struct timeout_queue *q, event_conn_state_t *cs) APR_RING_REMOVE(cs, timeout_list); APR_RING_ELEM_INIT(cs, timeout_list); - --*q->total; --q->count; + + /* Use atomic_set to be ordered/consistent with potential atomic reads + * outside the critical section, but writes are protected so a more + * expensive atomic_dec is not needed. 
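+     * A lock-free reader may still briefly observe the previous total,
+     * which is harmless for the stats, logs and heuristics reading it.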
+ */ + apr_atomic_set32(q->total, *q->total - 1); } static struct timeout_queue *TO_QUEUE_MAKE(apr_pool_t *p, @@ -717,6 +728,7 @@ static /*atomic*/ apr_uint32_t listensocks_off = 0; static int disable_listensocks(void) { + volatile process_score *ps; int i; if (apr_atomic_cas32(&listensocks_off, 1, 0) != 0) { @@ -738,7 +750,8 @@ static int disable_listensocks(void) apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); - ap_scoreboard_image->parent[ap_child_slot].not_accepting = 1; + ps = &ap_scoreboard_image->parent[ap_child_slot]; + ps->not_accepting = 1; for (i = 0; i < num_listensocks; i++) { apr_pollset_remove(event_pollset, &listener_pollfd[i]); @@ -748,9 +761,10 @@ static int disable_listensocks(void) static int enable_listensocks(void) { + volatile process_score *ps; int i; - if (listener_may_exit + if (apr_atomic_read32(&dying) || apr_atomic_cas32(&listensocks_off, 0, 1) != 1) { return 0; } @@ -774,7 +788,8 @@ static int enable_listensocks(void) * XXX: This is not yet optimal. If many workers suddenly become available, * XXX: the parent may kill some processes off too soon. */ - ap_scoreboard_image->parent[ap_child_slot].not_accepting = 0; + ps = &ap_scoreboard_image->parent[ap_child_slot]; + ps->not_accepting = 0; for (i = 0; i < num_listensocks; i++) { apr_pollset_add(event_pollset, &listener_pollfd[i]); @@ -809,7 +824,9 @@ static APR_INLINE int connections_above_limit(int *busy) static APR_INLINE int should_enable_listensocks(void) { - return !dying && listensocks_disabled() && !connections_above_limit(NULL); + return (listensocks_disabled() + && !apr_atomic_read32(&dying) + && !connections_above_limit(NULL)); } static void close_socket_at(apr_socket_t *csd, @@ -855,10 +872,9 @@ static void shutdown_listener(void) { ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, "shutting down listener%s", - listener_may_exit ? " again" : ""); + apr_atomic_read32(&listener_may_exit) ? " again" : ""); - listener_may_exit = 1; - disable_listensocks(); + apr_atomic_set32(&listener_may_exit, 1); /* Unblock the listener if it's poll()ing */ if (event_pollset && listener_is_wakeable) { @@ -914,7 +930,7 @@ static void signal_threads(int mode) * workers to exit once it has stopped accepting new connections */ if (mode == ST_UNGRACEFUL) { - workers_may_exit = 1; + apr_atomic_set32(&workers_may_exit, 1); ap_queue_interrupt_all(worker_queue); close_worker_sockets(); /* forcefully kill all current connections */ } @@ -993,7 +1009,7 @@ static int event_query(int query_code, int *result, apr_status_t *rv) static void event_note_child_stopped(int slot, pid_t pid, ap_generation_t gen) { if (slot != -1) { /* child had a scoreboard slot? */ - process_score *ps = &ap_scoreboard_image->parent[slot]; + volatile process_score *ps = &ap_scoreboard_image->parent[slot]; int i; pid = ps->pid; @@ -1079,8 +1095,9 @@ static int child_fatal; static apr_status_t decrement_connection_count(void *cs_) { - int is_last_connection; event_conn_state_t *cs = cs_; + int is_last_connection, is_dying; + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, "connection %" CS_FMT_TO " cleaned up", CS_ARG_TO(cs)); @@ -1097,12 +1114,13 @@ static apr_status_t decrement_connection_count(void *cs_) * now accept new connections. 
*/ is_last_connection = !apr_atomic_dec32(&connection_count); + is_dying = apr_atomic_read32(&dying); if (listener_is_wakeable - && ((is_last_connection && listener_may_exit) + && ((is_last_connection && is_dying) || should_enable_listensocks())) { apr_pollset_wakeup(event_pollset); } - if (dying) { + if (is_dying) { /* Help worker_thread_should_exit_early() */ ap_queue_interrupt_one(worker_queue); } @@ -1325,7 +1343,7 @@ static int pollset_add_at(event_conn_state_t *cs, int sense, } /* close_worker_sockets() may have closed it already */ - if (workers_may_exit) { + if (apr_atomic_read32(&workers_may_exit)) { AP_DEBUG_ASSERT(APR_STATUS_IS_EBADF(rv)); } else { @@ -1742,10 +1760,14 @@ static void check_infinite_requests(void) static void set_child_dying(void) { + volatile process_score *ps; + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, "quiescing"); + ps = &ap_scoreboard_image->parent[ap_child_slot]; + ps->quiescing = 1; - dying = 1; - ap_scoreboard_image->parent[ap_child_slot].quiescing = 1; + apr_atomic_set32(&dying, 1); + disable_listensocks(); /* definitively with dying = 1 */ ap_close_listeners_ex(my_bucket->listeners); #if 0 @@ -2340,7 +2362,7 @@ static APR_INLINE void shrink_timeout_queue(struct timeout_queue *queue, if (count) { ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, "All workers are %s, %s queue shrinked (%u done, %u left)", - dying ? "dying" : "busy", queue->name, + apr_atomic_read32(&dying) ? "dying" : "busy", queue->name, count, apr_atomic_read32(queue->total)); } } @@ -2384,8 +2406,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) now = poll_time = event_time_now(); - if (listener_may_exit) { - int once = !dying; + if (apr_atomic_read32(&listener_may_exit)) { + int once = !apr_atomic_read32(&dying); if (once) { set_child_dying(); } @@ -2519,7 +2541,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) timers_next_expiry ? timers_next_expiry - now : 0, listensocks_disabled() ? "no" : "yes", apr_atomic_read32(&connection_count), - listener_may_exit, dying); + apr_atomic_read32(&listener_may_exit), + apr_atomic_read32(&dying)); rc = apr_pollset_poll(event_pollset, timeout, &num, &out_pfd); if (rc != APR_SUCCESS) { @@ -2554,7 +2577,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) timers_next_expiry ? timers_next_expiry - now : 0, listensocks_disabled() ? 
"no" : "yes", apr_atomic_read32(&connection_count), - listener_may_exit, dying); + apr_atomic_read32(&listener_may_exit), + apr_atomic_read32(&dying)); for (user_chain = NULL; num > 0; --num, ++out_pfd) { listener_poll_type *pt = out_pfd->client_data; @@ -2601,7 +2625,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) */ continue; } - if (!dying) { + if (!apr_atomic_read32(&dying)) { void *csd = NULL; ap_listen_rec *lr = (ap_listen_rec *) pt->baton; apr_pool_t *ptrans; /* Pool for per-transaction stuff */ @@ -2744,7 +2768,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) process_timeout_queue(keepalive_q, now); /* The linger queue can be shrinked any time under pressure */ - if (workers_were_busy || dying) { + if (workers_were_busy || apr_atomic_read32(&dying)) { shrink_timeout_queue(linger_q, now); next_shrink_time = now + QUEUES_SHRINK_TIMEOUT; } @@ -2778,7 +2802,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->connections = apr_atomic_read32(&connection_count); } else if (next_shrink_time <= now - && (workers_were_busy || dying) + && (workers_were_busy || apr_atomic_read32(&dying)) && apr_atomic_read32(linger_q->total)) { apr_thread_mutex_lock(timeout_mutex); shrink_timeout_queue(linger_q, now); @@ -2870,17 +2894,18 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) } ap_update_child_status_from_indexes(process_slot, thread_slot, - dying ? SERVER_GRACEFUL - : SERVER_READY, + (apr_atomic_read32(&dying) + ? SERVER_GRACEFUL : SERVER_READY), NULL); - if (workers_may_exit) { + if (apr_atomic_read32(&workers_may_exit)) { ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, "worker thread %i/%i may exit", thread_slot, threads_per_child); break; } - if (dying && worker_thread_should_exit_early(thread_slot)) { + if (apr_atomic_read32(&dying) + && worker_thread_should_exit_early(thread_slot)) { break; } @@ -2907,7 +2932,7 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) * may have already been cleaned up. Don't log the "error" if * workers_may_exit is set. */ - if (!APR_STATUS_IS_EINTR(rv) && !workers_may_exit) { + if (!APR_STATUS_IS_EINTR(rv) && !apr_atomic_read32(&workers_may_exit)) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, ap_server_conf, APLOGNO(03099) "ap_queue_pop_event failed"); AP_DEBUG_ASSERT(0); @@ -2966,8 +2991,8 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) } ap_update_child_status_from_indexes(process_slot, thread_slot, - dying ? SERVER_DEAD - : SERVER_GRACEFUL, + (apr_atomic_read32(&dying) + ? 
SERVER_DEAD : SERVER_GRACEFUL), NULL); apr_thread_exit(thd, APR_SUCCESS); @@ -3240,7 +3265,8 @@ static void *APR_THREAD_FUNC start_threads(apr_thread_t * thd, void *dummy) } - if (start_thread_may_exit || threads_created == threads_per_child) { + if (apr_atomic_read32(&start_thread_may_exit) + || threads_created == threads_per_child) { break; } /* wait for previous generation to clean up an entry */ @@ -3290,9 +3316,9 @@ static void join_workers(apr_thread_t * listener, apr_thread_t ** threads) */ iter = 0; - while (!dying) { + while (!apr_atomic_read32(&dying)) { apr_sleep(apr_time_from_msec(500)); - if (dying || ++iter > 10) { + if (apr_atomic_read32(&dying) || ++iter > 10) { break; } /* listener has not stopped accepting yet */ @@ -3332,10 +3358,11 @@ static void join_start_thread(apr_thread_t * start_thread_id) { apr_status_t rv, thread_rv; - start_thread_may_exit = 1; /* tell it to give up in case it is still - * trying to take over slots from a - * previous generation - */ + /* tell it to give up in case it is still trying to take over slots + * from a previous generation + */ + apr_atomic_set32(&start_thread_may_exit, 1); + rv = apr_thread_join(&thread_rv, start_thread_id); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, ap_server_conf, APLOGNO(00478) From 143a83e09ba4496cfe6e41e4eb003ce17a76ce2d Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 3 Jun 2024 16:47:50 +0200 Subject: [PATCH 16/22] mpm_event: Periodic scoreboard stats update (1s). --- server/mpm/event/event.c | 115 +++++++++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 36 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 37e6f1b63fd..7141c46ce87 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -161,6 +161,9 @@ /* Shrink linger_q at this period (min) when busy */ #define QUEUES_SHRINK_TIMEOUT apr_time_from_msec(500) +/* Update scoreboard stats at this period */ +#define STATS_UPDATE_TIMEOUT apr_time_from_msec(1000) + /* Don't wait more time in poll() if APR_POLLSET_WAKEABLE is not implemented */ #define NON_WAKEABLE_TIMEOUT apr_time_from_msec(100) @@ -2367,15 +2370,53 @@ static APR_INLINE void shrink_timeout_queue(struct timeout_queue *queue, } } +static void update_stats(process_score *ps, apr_time_t now, + apr_time_t *when, int force) +{ + int expired = (*when <= now); + + if (expired || force) { + apr_atomic_set32(&ps->wait_io, apr_atomic_read32(waitio_q->total)); + apr_atomic_set32(&ps->write_completion, apr_atomic_read32(write_completion_q->total)); + apr_atomic_set32(&ps->keep_alive, apr_atomic_read32(keepalive_q->total)); + apr_atomic_set32(&ps->shutdown, apr_atomic_read32(shutdown_q->total)); + apr_atomic_set32(&ps->lingering_close, apr_atomic_read32(linger_q->total)); + apr_atomic_set32(&ps->backlog, apr_atomic_read32(backlog_q->total)); + apr_atomic_set32(&ps->suspended, apr_atomic_read32(&suspended_count)); + apr_atomic_set32(&ps->connections, apr_atomic_read32(&connection_count)); + } + + if (expired) { + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, + "child: idlers:%i conns:%u backlog:%u " + "waitio:%u write:%u keepalive:%u shutdown:%u linger:%u " + "timers:%u suspended:%u (%u/%u workers shutdown)", + ap_queue_info_idlers_count(worker_queue_info), + apr_atomic_read32(&connection_count), + apr_atomic_read32(backlog_q->total), + apr_atomic_read32(waitio_q->total), + apr_atomic_read32(write_completion_q->total), + apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(shutdown_q->total), + 
apr_atomic_read32(linger_q->total), + apr_atomic_read32(&timers_count), + apr_atomic_read32(&suspended_count), + apr_atomic_read32(&threads_shutdown), + threads_per_child); + + *when = now + STATS_UPDATE_TIMEOUT; + } +} + static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) { apr_status_t rc; proc_info *ti = dummy; int process_slot = ti->pslot; process_score *ps = ap_get_scoreboard_process(process_slot); - apr_time_t last_log, next_shrink_time = 0; + apr_time_t next_stats_time = 0, next_shrink_time = 0; + apr_interval_time_t min_poll_timeout = -1; - last_log = event_time_now(); free(ti); #if HAVE_SERF @@ -2388,11 +2429,21 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) apr_signal(LISTENER_SIGNAL, dummy_signal_handler); unblock_signal(LISTENER_SIGNAL); + /* Don't wait in poll() for more than NON_WAKEABLE_TIMEOUT if the pollset + * is not wakeable, and not more then the stats update period either. + */ + if (!listener_is_wakeable) { + min_poll_timeout = NON_WAKEABLE_TIMEOUT; + } + if (min_poll_timeout < 0 || min_poll_timeout > STATS_UPDATE_TIMEOUT) { + min_poll_timeout = STATS_UPDATE_TIMEOUT; + } + for (;;) { apr_int32_t num = 0; apr_time_t next_expiry = -1; apr_interval_time_t timeout = -1; - int workers_were_busy = 0; + int workers_were_busy = 0, force_stats = 0; socket_callback_baton_t *user_chain; const apr_pollfd_t *out_pfd; apr_time_t now, poll_time; @@ -2426,22 +2477,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } } - /* trace log status every second */ - if (APLOGtrace6(ap_server_conf) && now - last_log > apr_time_from_sec(1)) { - ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, - "connections: %u (waitio:%d write:%d keepalive:%d " - "lingering:%d suspended:%u), workers: %u/%u shutdown", - apr_atomic_read32(&connection_count), - apr_atomic_read32(waitio_q->total), - apr_atomic_read32(write_completion_q->total), - apr_atomic_read32(keepalive_q->total), - apr_atomic_read32(linger_q->total), - apr_atomic_read32(&suspended_count), - apr_atomic_read32(&threads_shutdown), - threads_per_child); - last_log = now; - } - #if HAVE_SERF rc = serf_context_prerun(g_serf); if (rc != APR_SUCCESS) { @@ -2512,15 +2547,32 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) timeout = next_expiry > now ? next_expiry - now : 0; } - /* When non-wakeable, don't wait more than 100 ms, in any case. */ - if (!listener_is_wakeable && (timeout < 0 || timeout > NON_WAKEABLE_TIMEOUT)) { - timeout = NON_WAKEABLE_TIMEOUT; + /* So long as there are connections, wake up at most every + * min_poll_timeout to refresh the scoreboard stats. + */ + if (timeout < 0 || timeout > min_poll_timeout) { + if (timeout > 0 + || !listener_is_wakeable + || apr_atomic_read32(&connection_count)) { + timeout = next_stats_time - now; + if (timeout <= 0 || timeout > min_poll_timeout) { + timeout = min_poll_timeout; + } + } + else { + /* No connections and entering infinite poll(), + * clear the stats first. + */ + force_stats = 1; + } } - else if (timeout > 0) { - /* apr_pollset_poll() might round down the timeout to - * milliseconds, let's forcibly round up here to never - * return before the timeout. - */ + update_stats(ps, now, &next_stats_time, force_stats); + + /* apr_pollset_poll() might round down the timeout to + * milliseconds, let's forcibly round up here to never + * return before the timeout. 
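+         * (e.g. a timeout of 1500us polls for 2ms, never for only 1ms).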
+ */ + if (timeout > 0) { timeout = apr_time_from_msec( apr_time_as_msec(timeout + apr_time_from_msec(1) - 1) ); @@ -2791,15 +2843,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "queues maintained: next timeout=%" APR_TIME_T_FMT, next_expiry ? next_expiry - now : -1); - - ps->wait_io = apr_atomic_read32(waitio_q->total); - ps->write_completion = apr_atomic_read32(write_completion_q->total); - ps->keep_alive = apr_atomic_read32(keepalive_q->total); - ps->shutdown = apr_atomic_read32(shutdown_q->total); - ps->lingering_close = apr_atomic_read32(linger_q->total); - ps->backlog = apr_atomic_read32(backlog_q->total); - ps->suspended = apr_atomic_read32(&suspended_count); - ps->connections = apr_atomic_read32(&connection_count); } else if (next_shrink_time <= now && (workers_were_busy || apr_atomic_read32(&dying)) From fccc1622e5b321815e57c7304cdee27f736f4211 Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 27 Jun 2023 05:33:34 +0200 Subject: [PATCH 17/22] mpm_event: Autotuning from MaxRequestWorkers. --- server/mpm/event/event.c | 494 ++++++++++++++++++++++++++------------- 1 file changed, 329 insertions(+), 165 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 7141c46ce87..3007dc8b33b 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -149,6 +149,21 @@ #define MAX_THREAD_LIMIT 100000 #endif +#ifndef DEFAULT_ASYNC_FACTOR +#define DEFAULT_ASYNC_FACTOR 2.0 +#endif + +#ifndef MAX_SPARE_THREADS_RATIO +#define MAX_SPARE_THREADS_RATIO 0.75 /* of MaxRequestWorkers */ +#endif +#ifndef MAX_DAEMONS_THREADS_RATIO +#define MAX_DAEMONS_THREADS_RATIO 32 +#endif + +#ifndef SCOREBOARD_DAEMONS_FACTOR +#define SCOREBOARD_DAEMONS_FACTOR 4 +#endif + #define MPM_CHILD_PID(i) (ap_scoreboard_image->parent[i].pid) #if !APR_VERSION_AT_LEAST(1,4,0) @@ -183,13 +198,6 @@ * Actual definitions of config globals */ -#ifndef DEFAULT_WORKER_FACTOR -#define DEFAULT_WORKER_FACTOR 2 -#endif -#define WORKER_FACTOR_SCALE 16 /* scale factor to allow fractional values */ -static unsigned int worker_factor = DEFAULT_WORKER_FACTOR * WORKER_FACTOR_SCALE; - /* AsyncRequestWorkerFactor * 16 */ - static int threads_per_child = 0; /* ThreadsPerChild */ static int ap_daemons_to_start = 0; /* StartServers */ static int min_spare_threads = 0; /* MinSpareThreads */ @@ -200,6 +208,12 @@ static int server_limit = 0; /* ServerLimit */ static int thread_limit = 0; /* ThreadLimit */ static int conns_this_child = 0; /* MaxConnectionsPerChild, only accessed in listener thread */ +static double async_factor = DEFAULT_ASYNC_FACTOR; /* AsyncRequestWorkerFactor */ + +static int auto_settings = 0; /* Auto settings based on max_workers + and num_online_cpus */ +static int num_online_cpus = 0; /* Number of CPUs detected */ + static /*atomic*/ apr_uint32_t dying = 0; static /*atomic*/ apr_uint32_t workers_may_exit = 0; static /*atomic*/ apr_uint32_t start_thread_may_exit = 0; @@ -627,11 +641,16 @@ typedef struct event_retained_data { apr_pool_t *gen_pool; /* generation pool (children start->stop lifetime) */ event_child_bucket *buckets; /* children buckets (reset per generation) */ + ap_listen_rec **listen_buckets; + int num_listen_buckets; + int first_server_limit; int first_thread_limit; + int first_server_sb_limit; int sick_child_detected; int maxclients_reported; int near_maxclients_reported; + /* * The max child slot ever assigned, preserved across restarts. 
Necessary * to deal with MaxRequestWorkers changes across AP_SIG_GRACEFUL restarts. @@ -815,7 +834,7 @@ static APR_INLINE int connections_above_limit(int *busy) /* Off by 'listensocks_disabled()' to avoid flip flop */ || c_count - l_count < (apr_uint32_t)threads_per_child + (i_count - listensocks_disabled()) * - (worker_factor / WORKER_FACTOR_SCALE)) { + async_factor) { return 0; } } @@ -3082,11 +3101,12 @@ static void setup_threads_runtime(void) const int good_methods[] = { APR_POLLSET_PORT, APR_POLLSET_KQUEUE, APR_POLLSET_EPOLL }; - /* XXX: K-A or lingering close connection included in the async factor */ - unsigned int async_factor = (worker_factor < WORKER_FACTOR_SCALE * 2 - ? WORKER_FACTOR_SCALE * 2 : worker_factor); - unsigned int async_threads = (threads_per_child * async_factor / WORKER_FACTOR_SCALE); - const apr_size_t pollset_size = (num_listensocks + async_threads + POLLSET_RESERVE_SIZE); + const double threads_factor = (async_factor < DEFAULT_ASYNC_FACTOR + ? DEFAULT_ASYNC_FACTOR + : async_factor); + const apr_size_t pollset_size = ((unsigned int)(threads_per_child * threads_factor) + + (unsigned int)num_listensocks + + POLLSET_RESERVE_SIZE); int pollset_flags; /* Event's skiplist operations will happen concurrently with other modules' @@ -4063,76 +4083,27 @@ static void server_main_loop(int remaining_children_to_start) static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) { - ap_listen_rec **listen_buckets = NULL; - int num_buckets = retained->mpm->num_buckets; int remaining_children_to_start; + int num_buckets, i; apr_status_t rv; - int i; ap_log_pid(pconf, ap_pid_fname); - /* On first startup create gen_pool to satisfy the lifetime of the - * parent's PODs and listeners; on restart stop the children from the - * previous generation and clear gen_pool for the next one. - */ - if (!retained->gen_pool) { - apr_pool_create(&retained->gen_pool, ap_pglobal); - } - else { - if (retained->mpm->was_graceful) { - /* wake up the children...time to die. But we'll have more soon */ - for (i = 0; i < num_buckets; i++) { - ap_mpm_podx_killpg(retained->buckets[i].pod, - active_daemons_limit, AP_MPM_PODX_GRACEFUL); - } - } - else { - /* Kill 'em all. Since the child acts the same on the parents SIGTERM - * and a SIGHUP, we may as well use the same signal, because some user - * pthreads are stealing signals from us left and right. - */ - for (i = 0; i < num_buckets; i++) { - ap_mpm_podx_killpg(retained->buckets[i].pod, - active_daemons_limit, AP_MPM_PODX_RESTART); - } - ap_reclaim_child_processes(1, /* Start with SIGTERM */ - event_note_child_stopped); - } - apr_pool_clear(retained->gen_pool); - retained->buckets = NULL; - - /* advance to the next generation */ - /* XXX: we really need to make sure this new generation number isn't in - * use by any of the previous children. - */ - ++retained->mpm->my_generation; - } - - /* On graceful restart, preserve the scoreboard and the listeners buckets. - * When ungraceful, clear the scoreboard and set num_buckets to zero to let - * ap_duplicate_listeners() below determine how many are needed/configured. - */ - if (!retained->mpm->was_graceful) { - if (ap_run_pre_mpm(s->process->pool, SB_SHARED) != OK) { - retained->mpm->mpm_state = AP_MPMQ_STOPPING; - return !OK; - } - num_buckets = (one_process) ? 
1 : 0; /* one_process => one bucket */ - retained->mpm->num_buckets = 0; /* reset idle_spawn_rate below */ + /* Preserve the scoreboard on graceful restart, reset when ungraceful */ + if (!retained->mpm->was_graceful + && ap_run_pre_mpm(s->process->pool, SB_SHARED)) { + retained->mpm->mpm_state = AP_MPMQ_STOPPING; + return !OK; } /* Now on for the new generation. */ ap_scoreboard_image->global->running_generation = retained->mpm->my_generation; ap_unixd_mpm_set_signals(pconf, one_process); - if ((rv = ap_duplicate_listeners(retained->gen_pool, ap_server_conf, - &listen_buckets, &num_buckets))) { - ap_log_error(APLOG_MARK, APLOG_CRIT, rv, - ap_server_conf, APLOGNO(03273) - "could not duplicate listeners"); - return !OK; - } - + /* Set the buckets listeners from the listen_buckets initialized + * in event_open_logs(). + */ + num_buckets = retained->num_listen_buckets; retained->buckets = apr_pcalloc(retained->gen_pool, num_buckets * sizeof(event_child_bucket)); for (i = 0; i < num_buckets; i++) { @@ -4144,8 +4115,11 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) "could not open pipe-of-death"); return !OK; } - retained->buckets[i].listeners = listen_buckets[i]; + retained->buckets[i].listeners = retained->listen_buckets[i]; } + /* Reset for the next generation/restart */ + retained->listen_buckets = NULL; + retained->num_listen_buckets = 0; /* If num_buckets changed, adjust max_spawn_rate and the free_slots buffer */ if (retained->mpm->num_buckets != num_buckets) { @@ -4178,23 +4152,14 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) retained->mpm->num_buckets = num_buckets; } - /* Don't thrash since num_buckets depends on the - * system and the number of online CPU cores... - */ - if (active_daemons_limit < num_buckets) - active_daemons_limit = num_buckets; - if (ap_daemons_to_start < num_buckets) - ap_daemons_to_start = num_buckets; - /* We want to create as much children at a time as the number of buckets, - * so to optimally accept connections (evenly distributed across buckets). - * Thus min_spare_threads should at least maintain num_buckets children, - * and max_spare_threads allow num_buckets more children w/o triggering - * immediately (e.g. num_buckets idle threads margin, one per bucket). - */ - if (min_spare_threads < threads_per_child * (num_buckets - 1) + num_buckets) - min_spare_threads = threads_per_child * (num_buckets - 1) + num_buckets; - if (max_spare_threads < min_spare_threads + (threads_per_child + 1) * num_buckets) - max_spare_threads = min_spare_threads + (threads_per_child + 1) * num_buckets; + ap_log_error(APLOG_MARK, APLOG_INFO, 0, ap_server_conf, APLOGNO(10464) + "MPM event settings%s: MaxRequestWorkers=%d AsyncRequestWorkerFactor=%.1lf " + "ThreadsPerChild=%d ThreadLimit=%d MinSpareThreads=%d MaxSpareThreads=%d " + "ServerLimit=%d/%d StartServers=%d Buckets=%d CPUs=%d", + auto_settings ? " (auto)" : "", max_workers, async_factor, + threads_per_child, thread_limit, min_spare_threads, max_spare_threads, + active_daemons_limit, server_limit, ap_daemons_to_start, + num_buckets, num_online_cpus); /* If we're doing a graceful_restart then we're going to see a lot * of children exiting immediately when we get into the main loop @@ -4382,12 +4347,18 @@ static int event_protocol_switch(conn_rec *c, request_rec *r, server_rec *s, /* This really should be a post_config hook, but the error log is already * redirected by that point, so we need to do this in the open_logs phase. 
+ * We compute num_buckets here too, thus the definitive AP_MPMQ_* settings + * which need it and which may be needed by the post_config hooks of other + * modules. */ static int event_open_logs(apr_pool_t * p, apr_pool_t * plog, apr_pool_t * ptemp, server_rec * s) { int startup = 0; int level_flags = 0; + int num_buckets = 0, i; + int min_threads; + apr_status_t rv; pconf = p; @@ -4408,6 +4379,152 @@ static int event_open_logs(apr_pool_t * p, apr_pool_t * plog, return !OK; } + /* On first startup create gen_pool to satisfy the lifetime of the + * parent's PODs and listeners; on restart stop the children from the + * previous generation and clear gen_pool for the next one. + */ + if (!retained->gen_pool) { + apr_pool_create(&retained->gen_pool, ap_pglobal); + } + else { + num_buckets = retained->mpm->num_buckets; + if (retained->mpm->was_graceful) { + /* wake up the children...time to die. But we'll have more soon */ + for (i = 0; i < num_buckets; i++) { + ap_mpm_podx_killpg(retained->buckets[i].pod, + active_daemons_limit, AP_MPM_PODX_GRACEFUL); + } + } + else { + /* Kill 'em all. Since the child acts the same on the parents SIGTERM + * and a SIGHUP, we may as well use the same signal, because some user + * pthreads are stealing signals from us left and right. + */ + for (i = 0; i < num_buckets; i++) { + ap_mpm_podx_killpg(retained->buckets[i].pod, + active_daemons_limit, AP_MPM_PODX_RESTART); + } + ap_reclaim_child_processes(1, /* Start with SIGTERM */ + event_note_child_stopped); + } + apr_pool_clear(retained->gen_pool); + retained->buckets = NULL; + + /* advance to the next generation */ + /* XXX: we really need to make sure this new generation number isn't in + * use by any of the previous children. + */ + ++retained->mpm->my_generation; + } + + /* On graceful restart, preserve the listeners buckets. When ungraceful, + * set num_buckets to zero to let ap_duplicate_listeners() below determine + * how many are needed/configured. + */ + if (!retained->mpm->was_graceful) { + num_buckets = (one_process) ? 1 : 0; /* one_process => one bucket */ + retained->mpm->num_buckets = 0; /* old gen's until event_run() */ + } + if ((rv = ap_duplicate_listeners(retained->gen_pool, ap_server_conf, + &retained->listen_buckets, + &num_buckets))) { + ap_log_error(APLOG_MARK, APLOG_ALERT | level_flags, rv, + (startup ? NULL : s), APLOGNO(03273) + "could not duplicate listeners, shutting down"); + return !OK; + } + retained->num_listen_buckets = num_buckets; + + /* Don't thrash since num_buckets depends on the system and the + * number of CPU cores, so make the settings consistent. 
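+     * All the derived limits below are therefore rounded to multiples of
+     * ThreadsPerChild * num_buckets where applicable.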
+ */ + if (retained->first_thread_limit) { + if (threads_per_child > retained->first_thread_limit) { + ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(10465) + "ThreadsPerChild (%d) exceeds initial ThreadLimit, " + "forcing ThreadsPerChild to %d", + threads_per_child, retained->first_thread_limit); + threads_per_child = retained->first_thread_limit; + } + } + else { + if (thread_limit < threads_per_child) { + thread_limit = threads_per_child; + } + retained->first_thread_limit = thread_limit; + } + min_threads = threads_per_child * num_buckets; + if (max_workers < min_threads) { + max_workers = min_threads; + } + else { + max_workers = (max_workers / min_threads) * min_threads; + } + active_daemons_limit = max_workers / threads_per_child; + if (retained->first_server_limit) { + if (active_daemons_limit > retained->first_server_sb_limit) { + int new_max_workers = retained->first_server_sb_limit * threads_per_child; + if (new_max_workers < min_threads) { + new_max_workers = min_threads; + } + else { + new_max_workers = (new_max_workers / min_threads) * min_threads; + } + ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(10466) + "MaxRequestWorkers (%d) / ThreadsPerChild (%d) would " + "exceed initial scoreboard limit (%d), forcing " + "MaxRequestWorkers to %d", + max_workers, threads_per_child, + retained->first_server_sb_limit, + new_max_workers); + max_workers = new_max_workers; + active_daemons_limit = retained->first_server_sb_limit; + } + server_limit = retained->first_server_sb_limit; + } + else { + /* Save the initial ServerLimit which cannot be changed on restart, but + * leave some spare room in the actual server_[sb_]limit (used to size + * the scoreboard) to allow for children restarting while the old gen + * is gracefully exiting. 
+ */ + retained->first_server_limit = server_limit; + if (server_limit < active_daemons_limit * SCOREBOARD_DAEMONS_FACTOR) { + server_limit = active_daemons_limit * SCOREBOARD_DAEMONS_FACTOR; + } + retained->first_server_sb_limit = server_limit; + } + if (ap_daemons_to_start < num_buckets) { + ap_daemons_to_start = num_buckets; + } + else if (ap_daemons_to_start < active_daemons_limit) { + ap_daemons_to_start = (ap_daemons_to_start / num_buckets) * num_buckets; + } + else { + ap_daemons_to_start = active_daemons_limit; + } + if (min_spare_threads < ap_daemons_to_start * threads_per_child) { + min_spare_threads = ap_daemons_to_start * threads_per_child; + } + else if (min_spare_threads < max_workers) { + min_spare_threads = (min_spare_threads / min_threads) * min_threads; + } + else { + min_spare_threads = max_workers; + } + if (max_spare_threads < 0) { /* auto settings */ + max_spare_threads = max_workers * MAX_SPARE_THREADS_RATIO; + } + if (max_spare_threads < min_spare_threads + min_threads) { + max_spare_threads = min_spare_threads + min_threads; + } + else if (max_spare_threads < max_workers) { + max_spare_threads = (max_spare_threads / min_threads) * min_threads; + } + else { + max_spare_threads = max_workers; + } + return OK; } @@ -4465,7 +4582,8 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, /* sigh, want this only the second time around */ if (retained->mpm->module_loads == 2) { - rv = apr_pollset_create(&event_pollset, 1, plog, + apr_pollset_t *tmp = NULL; + rv = apr_pollset_create(&tmp, 1, plog, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, NULL, APLOGNO(00495) @@ -4474,7 +4592,7 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, "Also check system or user limits!"); return HTTP_INTERNAL_SERVER_ERROR; } - apr_pollset_destroy(event_pollset); + apr_pollset_destroy(tmp); if (!one_process && !foreground) { /* before we detach, setup crash handlers to log to errorlog */ @@ -4492,21 +4610,25 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, parent_pid = ap_my_pid = getpid(); ap_listen_pre_config(); - ap_daemons_to_start = DEFAULT_START_DAEMON; - min_spare_threads = DEFAULT_MIN_FREE_DAEMON * DEFAULT_THREADS_PER_CHILD; - max_spare_threads = DEFAULT_MAX_FREE_DAEMON * DEFAULT_THREADS_PER_CHILD; - server_limit = DEFAULT_SERVER_LIMIT; - thread_limit = DEFAULT_THREAD_LIMIT; - active_daemons_limit = server_limit; - threads_per_child = DEFAULT_THREADS_PER_CHILD; - max_workers = active_daemons_limit * threads_per_child; had_healthy_child = 0; ap_extended_status = 0; - event_pollset = NULL; - worker_queue_info = NULL; - listener_os_thread = NULL; - listener_is_wakeable = 0; + max_workers = -1; + threads_per_child = -1; + min_spare_threads = max_spare_threads = -1; + server_limit = thread_limit = -1; + ap_daemons_to_start = -1; + auto_settings = 0; + +#ifndef _SC_NPROCESSORS_ONLN + num_online_cpus = 1; +#else + num_online_cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (num_online_cpus < 1) { + num_online_cpus = 1; + } +#endif + async_factor = DEFAULT_ASYNC_FACTOR; return OK; } @@ -4563,7 +4685,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, startup = 1; } - if (server_limit > MAX_SERVER_LIMIT) { + if (server_limit < 0) { + server_limit = DEFAULT_SERVER_LIMIT; + } + else if (server_limit > MAX_SERVER_LIMIT) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00497) "WARNING: ServerLimit of %d exceeds compile-time " @@ 
-4577,7 +4702,7 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } server_limit = MAX_SERVER_LIMIT; } - else if (server_limit < 1) { + else if (server_limit == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00499) "WARNING: ServerLimit of %d not allowed, " @@ -4589,14 +4714,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } server_limit = 1; } - /* you cannot change ServerLimit across a restart; ignore * any such attempts */ - if (!retained->first_server_limit) { - retained->first_server_limit = server_limit; - } - else if (server_limit != retained->first_server_limit) { + if (retained->first_server_limit && server_limit != retained->first_server_limit) { /* don't need a startup console version here */ ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00501) "changing ServerLimit to %d from original value of %d " @@ -4605,7 +4726,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, server_limit = retained->first_server_limit; } - if (thread_limit > MAX_THREAD_LIMIT) { + if (thread_limit < 0) { + thread_limit = DEFAULT_THREAD_LIMIT; + } + else if (thread_limit > MAX_THREAD_LIMIT) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00502) "WARNING: ThreadLimit of %d exceeds compile-time " @@ -4619,7 +4743,7 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } thread_limit = MAX_THREAD_LIMIT; } - else if (thread_limit < 1) { + else if (thread_limit == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00504) "WARNING: ThreadLimit of %d not allowed, " @@ -4631,14 +4755,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } thread_limit = 1; } - /* you cannot change ThreadLimit across a restart; ignore * any such attempts */ - if (!retained->first_thread_limit) { - retained->first_thread_limit = thread_limit; - } - else if (thread_limit != retained->first_thread_limit) { + if (retained->first_thread_limit && thread_limit != retained->first_thread_limit) { /* don't need a startup console version here */ ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00506) "changing ThreadLimit to %d from original value of %d " @@ -4647,7 +4767,41 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, thread_limit = retained->first_thread_limit; } - if (threads_per_child > thread_limit) { + /* Auto settings depend on max_workers and num_buckets, the latter being + * known in event_open_logs() only. So defer to there (with no warnings + * since it's somewhat auto..). + */ + if (auto_settings) { + if (max_workers <= 0) { + /* This used to warn before auto settings, just take the + * default value still but silently. + */ + max_workers = DEFAULT_SERVER_LIMIT * DEFAULT_THREADS_PER_CHILD; + } + if (threads_per_child <= 0) { + /* Default threads_per_child is the number of CPUs */ + threads_per_child = num_online_cpus; + + /* With a lot of workers and not so much CPUs to handle them, + * spawn more threads to get a reasonable active_daemons_limit + * i.e. processes / threads ratio. + */ + while (max_workers / threads_per_child > + threads_per_child * MAX_DAEMONS_THREADS_RATIO) { + threads_per_child *= 2; + } + } + return OK; /* => event_open_logs() */ + } + + /* No auto settings; use the default for anything not set (or set to + * some negative value), warn about nonsense values and adjust otherwise. 
+ */ + + if (threads_per_child < 0) { + threads_per_child = DEFAULT_THREADS_PER_CHILD; + } + else if (threads_per_child > thread_limit) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00507) "WARNING: ThreadsPerChild of %d exceeds ThreadLimit " @@ -4662,7 +4816,7 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } threads_per_child = thread_limit; } - else if (threads_per_child < 1) { + else if (threads_per_child == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00509) "WARNING: ThreadsPerChild of %d not allowed, " @@ -4675,7 +4829,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, threads_per_child = 1; } - if (max_workers < threads_per_child) { + if (max_workers < 0) { + max_workers = DEFAULT_SERVER_LIMIT * DEFAULT_THREADS_PER_CHILD; + } + else if (max_workers < threads_per_child) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00511) "WARNING: MaxRequestWorkers of %d is less than " @@ -4693,27 +4850,6 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } active_daemons_limit = max_workers / threads_per_child; - - if (max_workers % threads_per_child) { - int tmp_max_workers = active_daemons_limit * threads_per_child; - - if (startup) { - ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00513) - "WARNING: MaxRequestWorkers of %d is not an integer " - "multiple of ThreadsPerChild of %d, decreasing to nearest " - "multiple %d, for a maximum of %d servers.", - max_workers, threads_per_child, tmp_max_workers, - active_daemons_limit); - } else { - ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00514) - "MaxRequestWorkers of %d is not an integer multiple " - "of ThreadsPerChild of %d, decreasing to nearest " - "multiple %d", max_workers, threads_per_child, - tmp_max_workers); - } - max_workers = tmp_max_workers; - } - if (active_daemons_limit > server_limit) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00515) @@ -4730,10 +4866,34 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, server_limit * threads_per_child); } active_daemons_limit = server_limit; + max_workers = active_daemons_limit * threads_per_child; + } + else if (max_workers % threads_per_child) { + int new_max_workers = active_daemons_limit * threads_per_child; + if (startup) { + ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00513) + "WARNING: MaxRequestWorkers of %d is not an integer " + "multiple of ThreadsPerChild of %d, decreasing to nearest " + "multiple %d, for a maximum of %d servers.", + max_workers, threads_per_child, new_max_workers, + active_daemons_limit); + } else { + ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00514) + "MaxRequestWorkers of %d is not an integer multiple " + "of ThreadsPerChild of %d, decreasing to nearest " + "multiple %d", max_workers, threads_per_child, + new_max_workers); + } + max_workers = new_max_workers; } - /* ap_daemons_to_start > active_daemons_limit checked in ap_mpm_run() */ - if (ap_daemons_to_start < 1) { + if (ap_daemons_to_start < 0) { + ap_daemons_to_start = DEFAULT_START_DAEMON; + } + else if (ap_daemons_to_start > active_daemons_limit) { + ap_daemons_to_start = active_daemons_limit; + } + else if (ap_daemons_to_start == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00517) "WARNING: StartServers of %d not allowed, " @@ -4746,7 +4906,10 @@ 
static int event_check_config(apr_pool_t *p, apr_pool_t *plog, ap_daemons_to_start = 1; } - if (min_spare_threads < 1) { + if (min_spare_threads < 0) { + min_spare_threads = DEFAULT_MIN_FREE_DAEMON * DEFAULT_THREADS_PER_CHILD; + } + else if (min_spare_threads == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00519) "WARNING: MinSpareThreads of %d not allowed, " @@ -4758,12 +4921,18 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, "MinSpareThreads of %d not allowed, increasing to 1", min_spare_threads); } - min_spare_threads = 1; + min_spare_threads = threads_per_child; } - /* max_spare_threads < min_spare_threads + threads_per_child - * checked in ap_mpm_run() - */ + if (max_spare_threads < 0) { + max_spare_threads = DEFAULT_MAX_FREE_DAEMON * DEFAULT_THREADS_PER_CHILD; + } + else { + /* max_spare_threads value has never been checked, it's silently + * adjusted in event_open_logs() such that max_spare_threads >= + * min_spare_threads + threads_per_child. + */ + } return OK; } @@ -4839,7 +5008,7 @@ static const char *set_max_spare_threads(cmd_parms * cmd, void *dummy, } static const char *set_max_workers(cmd_parms * cmd, void *dummy, - const char *arg) + const char *arg, const char *arg2) { const char *err = ap_check_cmd_context(cmd, GLOBAL_ONLY); if (err != NULL) { @@ -4850,7 +5019,10 @@ static const char *set_max_workers(cmd_parms * cmd, void *dummy, "MaxClients is deprecated, use MaxRequestWorkers " "instead."); } + max_workers = atoi(arg); + auto_settings = (arg2 && !strcasecmp(arg2, "auto")); + return NULL; } @@ -4891,23 +5063,15 @@ static const char *set_thread_limit(cmd_parms * cmd, void *dummy, static const char *set_worker_factor(cmd_parms * cmd, void *dummy, const char *arg) { - double val; char *endptr; const char *err = ap_check_cmd_context(cmd, GLOBAL_ONLY); if (err != NULL) { return err; } - val = strtod(arg, &endptr); - if (*endptr) - return "error parsing value"; - - if (val <= 0) - return "AsyncRequestWorkerFactor argument must be a positive number"; - - worker_factor = val * WORKER_FACTOR_SCALE; - if (worker_factor < WORKER_FACTOR_SCALE) { - worker_factor = WORKER_FACTOR_SCALE; + async_factor = strtod(arg, &endptr); + if (*endptr || async_factor < 1.0) { + return "AsyncRequestWorkerFactor must be a rational number greater or equal to 1"; } return NULL; } @@ -4923,10 +5087,10 @@ static const command_rec event_cmds[] = { "Minimum number of idle threads, to handle request spikes"), AP_INIT_TAKE1("MaxSpareThreads", set_max_spare_threads, NULL, RSRC_CONF, "Maximum number of idle threads"), - AP_INIT_TAKE1("MaxClients", set_max_workers, NULL, RSRC_CONF, - "Deprecated name of MaxRequestWorkers"), - AP_INIT_TAKE1("MaxRequestWorkers", set_max_workers, NULL, RSRC_CONF, - "Maximum number of threads alive at the same time"), + AP_INIT_TAKE12("MaxClients", set_max_workers, NULL, RSRC_CONF, + "Deprecated name of MaxRequestWorkers"), + AP_INIT_TAKE12("MaxRequestWorkers", set_max_workers, NULL, RSRC_CONF, + "Maximum number of threads alive at the same time"), AP_INIT_TAKE1("ThreadsPerChild", set_threads_per_child, NULL, RSRC_CONF, "Number of threads each child creates"), AP_INIT_TAKE1("ThreadLimit", set_thread_limit, NULL, RSRC_CONF, From fb8839306b1eb6f8a8633989f2504d7b45696edb Mon Sep 17 00:00:00 2001 From: ylavic Date: Wed, 10 Jul 2024 15:10:50 +0200 Subject: [PATCH 18/22] mpm_event: Propose some new connections_above_limit() heuristics. 
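
For illustration, the retained heuristic (LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS
below) boils down to the following standalone sketch. The function and its
plain-int parameters are illustrative only, not the actual event.c code, which
reads atomic counters; per the patch the hard cap corresponds to
workers_backlog_limit = threads_per_child * async_factor:

    /* Sketch: stop accepting when the potentially blocking part of the
     * backlog exceeds the idle workers, or when the whole backlog exceeds
     * a hard cap. Plain ints stand in for the MPM's atomics.
     */
    static int above_limit_sketch(int idlers,           /* >= 0: idle workers;
                                                           < 0: conns waiting in
                                                           the backlog */
                                  int backlog_nonblock, /* backlog conns that
                                                           cannot block a worker */
                                  int backlog_limit)    /* hard backlog cap */
    {
        if (idlers >= -backlog_limit) {
            /* "nonblocking" backlog conns don't count against the idlers */
            idlers += backlog_nonblock;
            if (idlers >= 0) {
                return 0; /* keep accepting */
            }
        }
        return 1; /* disable the listening sockets */
    }

For instance (still with illustrative numbers):

    above_limit_sketch(2, 0, 100);     /* == 0: two idle workers, no backlog */
    above_limit_sketch(-4, 4, 100);    /* == 0: backlog is all "nonblocking" */
    above_limit_sketch(-101, 90, 100); /* == 1: backlog above the hard cap */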
---
 server/mpm/event/event.c | 155 +++++++++++++++++++++++++++++++++++----
 1 file changed, 141 insertions(+), 14 deletions(-)

diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index 3007dc8b33b..e0ba249bbf7 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -214,6 +214,9 @@ static int auto_settings = 0; /* Auto settings based on max_worker and num_online_cpus */
 static int num_online_cpus = 0; /* Number of CPUs detected */
 
+static int workers_backlog_limit = 0; /* Max number of events in the workers' backlog
+                                         (above which not accepting new connections) */
+
 static /*atomic*/ apr_uint32_t dying = 0;
 static /*atomic*/ apr_uint32_t workers_may_exit = 0;
 static /*atomic*/ apr_uint32_t start_thread_may_exit = 0;
@@ -824,23 +827,119 @@ static APR_INLINE int listensocks_disabled(void)
     return apr_atomic_read32(&listensocks_off) != 0;
 }
 
-static APR_INLINE int connections_above_limit(int *busy)
+/* Choose one of these */
+#define LIMIT_BY_CONNS_TOTAL_VS_IDLERS 0
+#define LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS 0
+#define LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS 1 /* the winner? */
+#define LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS 0
+
+#if LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS
+/* The rationale for backlog_nonblock_count is that only connections about
+ * to be processed outside the MPM can make a worker thread block, since we
+ * have no guarantee that modules won't block processing them. The core will
+ * not block processing TLS handshakes or reading the HTTP header for instance,
+ * but once the connections are passed to modules they may block in a handler
+ * reading the body or whatever. Those connections are in CONN_STATE_PROCESSING
+ * state in the backlog, which includes newly accepted connections and the ones
+ * waking up from CONN_STATE_KEEPALIVE and CONN_STATE_ASYNC_WAITIO.
+ * But the processing by/inside the MPM itself never blocks, so the connections
+ * fully handled by the MPM can be accounted for differently in
+ * connections_above_limit(), which is where backlog_nonblock_count helps.
+ */
+static /*atomic*/ apr_uint32_t backlog_nonblock_count;
+#endif
+
+static APR_INLINE int connections_above_limit(void)
 {
-    apr_int32_t i_count = ap_queue_info_idlers_count(worker_queue_info);
-    if (i_count > 0) {
-        apr_uint32_t c_count = apr_atomic_read32(&connection_count);
-        apr_uint32_t l_count = apr_atomic_read32(linger_q->total);
-        if (c_count <= l_count
-            /* Off by 'listensocks_disabled()' to avoid flip flop */
-            || c_count - l_count < (apr_uint32_t)threads_per_child +
-                                   (i_count - listensocks_disabled()) *
-                                   async_factor) {
+    /* Note that idlers >= 0 gives the number of idle workers, idlers < 0 gives
+     * the number of connections in the backlog waiting for an idle worker.
+     */
+    int idlers = ap_queue_info_idlers_count(worker_queue_info);
+
+#if LIMIT_BY_CONNS_TOTAL_VS_IDLERS
+
+    /* Limit reached when the number of connections (excluding the ones in
+     * lingering close) is above the number of idle workers.
+     */
+    if (idlers >= 0) {
+        int conns = (apr_atomic_read32(&connection_count) -
+                     apr_atomic_read32(linger_q->total));
+        AP_DEBUG_ASSERT(conns >= 0);
+        if (idlers >= conns) {
+            return 0;
+        }
+    }
+
+#elif LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS
+
+    /* Limit reached when the number of potentially blocking connections in
+     * the backlog is above the number of idle workers.
+     *
+     * Ignore connections in the backlog with "nonblocking" states by adding
+     * them back.
+     */
+    idlers += apr_atomic_read32(&backlog_nonblock_count);
+    if (idlers >= 0) {
+        return 0;
+    }
+
+#elif LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS
+
+    /* Limit reached when the number of potentially blocking connections in
+     * the backlog is above the number of idle workers, or the total number
+     * of connections waiting for a worker in the backlog is above some hard
+     * workers_backlog_limit.
+     */
+    if (idlers >= -workers_backlog_limit) {
+        /* Ignore connections in the backlog with "nonblocking" states by
+         * adding them back.
+         */
+        idlers += apr_atomic_read32(&backlog_nonblock_count);
+        if (idlers >= 0) {
+            return 0;
+        }
+    }
+
+#elif LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS
+
+    /* Limit reached when the number of potentially blocking connections in
+     * the backlog *and* the queues is above the number of idle workers, or
+     * the total number of connections waiting for a worker in the backlog
+     * is above some hard workers_backlog_limit.
+     */
+    if (idlers >= -workers_backlog_limit) {
+        /* Ignore connections in the backlog with "nonblocking" states by
+         * adding them back.
+         */
+        idlers += apr_atomic_read32(&backlog_nonblock_count);
+        if (idlers >= (apr_atomic_read32(keepalive_q->total) +
+                       apr_atomic_read32(waitio_q->total))) {
             return 0;
         }
     }
-    else if (busy) {
-        *busy = 1;
+
+#else
+
+    /* Legacy heuristic, but w/o ignoring the keepalive_q (not shrunk anymore).
+     * Limit reached when the number of conns (besides lingering close ones)
+     * is above some unclear limit (the total number of workers plus the
+     * number of idle workers times the async factor).
+     */
+    int off = listensocks_disabled(); /* off by disabled() to limit flip flop */
+    if (idlers >= off) {
+        int avail = (threads_per_child + (int)((idlers - off) * async_factor));
+        int conns = (apr_atomic_read32(&connection_count) -
+                     apr_atomic_read32(linger_q->total));
+        AP_DEBUG_ASSERT(conns >= 0);
+        if (avail >= conns) {
+            return 0;
+        }
     }
+
+#endif
+
     return 1;
 }
 
@@ -848,7 +947,7 @@ static APR_INLINE int should_enable_listensocks(void)
 {
     return (listensocks_disabled()
             && !apr_atomic_read32(&dying)
-            && !connections_above_limit(NULL));
+            && !connections_above_limit());
 }
 
 static void close_socket_at(apr_socket_t *csd,
@@ -1888,8 +1987,34 @@ static void conn_state_backlog_cb(void *baton, int pushed)
 
     if (pushed) {
         TO_QUEUE_APPEND(cs->sc->bl_q, cs);
+#if LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS
+        if (cs->pub.state != CONN_STATE_PROCESSING) {
+            /* These connections won't block when processed.
+             *
+             * Increment *after* TO_QUEUE_APPEND() to make sure that:
+             *   cs->sc->bl_q->total >= backlog_nonblock_count
+             * always holds.
+             */
+            apr_atomic_inc32(&backlog_nonblock_count);
+        }
+#endif
     }
     else { /* popped */
+#if LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS
+        if (cs->pub.state != CONN_STATE_PROCESSING) {
+            /* These connections won't block when processed.
+             *
+             * Decrement *before* TO_QUEUE_REMOVE() to make sure that:
+             *   cs->sc->bl_q->total >= backlog_nonblock_count
+             * always holds.
+             */
+            apr_atomic_dec32(&backlog_nonblock_count);
+        }
+#endif
         TO_QUEUE_REMOVE(cs->sc->bl_q, cs);
 
         /* not in backlog anymore */
@@ -1932,7 +2057,7 @@ static void push2worker(event_conn_state_t *cs, timer_event_t *te,
      * the situation settles down.
The listener and new idling workers will * test for should_enable_listensocks() to recover (when suitable). */ - if (connections_above_limit(NULL)) { + if (connections_above_limit()) { disable_listensocks(); if (above_limit) { *above_limit = 1; @@ -4525,6 +4650,8 @@ static int event_open_logs(apr_pool_t * p, apr_pool_t * plog, max_spare_threads = max_workers; } + workers_backlog_limit = threads_per_child * async_factor; + return OK; } From 94baa05601f6c0ea936ae42c18f2acd923859091 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 8 Jul 2024 19:19:22 +0200 Subject: [PATCH 19/22] mod_ssl: Nonblocking/async handshakes in CONN_STATE_PROCESSING phase. If AP_MPMQ_CAN_WAITIO, make mod_ssl perform non blocking TLS handshakes and go async when it would block. --- changes-entries/mod_ssl_async_handshakes.txt | 1 + modules/ssl/mod_ssl.c | 66 ++++++++++++++++---- modules/ssl/ssl_engine_io.c | 59 ++++++++++++++--- modules/ssl/ssl_private.h | 7 +++ 4 files changed, 112 insertions(+), 21 deletions(-) create mode 100644 changes-entries/mod_ssl_async_handshakes.txt diff --git a/changes-entries/mod_ssl_async_handshakes.txt b/changes-entries/mod_ssl_async_handshakes.txt new file mode 100644 index 00000000000..e19eeb629de --- /dev/null +++ b/changes-entries/mod_ssl_async_handshakes.txt @@ -0,0 +1 @@ + *) mod_ssl: Perform non blocking and async TLS handshakes. [Graham Leggett] diff --git a/modules/ssl/mod_ssl.c b/modules/ssl/mod_ssl.c index 420ae6b79ac..5cae44a64a8 100644 --- a/modules/ssl/mod_ssl.c +++ b/modules/ssl/mod_ssl.c @@ -29,6 +29,7 @@ #include "util_md5.h" #include "util_mutex.h" #include "ap_provider.h" +#include "ap_mpm.h" #include "http_config.h" #include "mod_proxy.h" /* for proxy_hook_section_post_config() */ @@ -40,6 +41,8 @@ int ssl_running_on_valgrind = 0; #endif +static int mpm_can_waitio = 0; + #if HAVE_OPENSSL_INIT_SSL || (OPENSSL_VERSION_NUMBER >= 0x10100000L && \ !defined(LIBRESSL_VERSION_NUMBER)) /* Openssl v1.1+ handles all termination automatically from @@ -464,6 +467,16 @@ static int ssl_hook_pre_config(apr_pool_t *pconf, return OK; } +static int ssl_hook_post_config(apr_pool_t *pconf, apr_pool_t *plog, + apr_pool_t *ptemp, server_rec *s) +{ + if (ap_mpm_query(AP_MPMQ_CAN_WAITIO, &mpm_can_waitio) != APR_SUCCESS) { + mpm_can_waitio = 0; + } + + return OK; +} + static SSLConnRec *ssl_init_connection_ctx(conn_rec *c, ap_conf_vector_t *per_dir_config, int reinit) @@ -692,8 +705,9 @@ static int ssl_hook_pre_connection(conn_rec *c, void *csd) static int ssl_hook_process_connection(conn_rec* c) { SSLConnRec *sslconn = myConnConfig(c); + int status = DECLINED; - if (sslconn && !sslconn->disabled) { + if (sslconn && !sslconn->disabled && !sslconn->initialized) { /* On an active SSL connection, let the input filters initialize * themselves which triggers the handshake, which again triggers * all kinds of useful things such as SNI and ALPN. @@ -701,23 +715,50 @@ static int ssl_hook_process_connection(conn_rec* c) apr_bucket_brigade* temp; apr_status_t rv; - temp = apr_brigade_create(c->pool, c->bucket_alloc); - rv = ap_get_brigade(c->input_filters, temp, - AP_MODE_INIT, APR_BLOCK_READ, 0); - apr_brigade_destroy(temp); - - if (APR_SUCCESS != APR_SUCCESS) { + temp = ap_acquire_brigade(c); + rv = ap_get_brigade(c->input_filters, temp, AP_MODE_INIT, + mpm_can_waitio ? 
APR_NONBLOCK_READ : APR_BLOCK_READ,
+                            0);
+        ap_release_brigade(c, temp);
+
+        if (rv == APR_SUCCESS) {
+            /* great news, let's continue */
+            ap_log_cerror(APLOG_MARK, APLOG_DEBUG, 0, c, APLOGNO(10370)
+                          "SSL handshake completed, continuing");
+            sslconn->initialized = 1;
+        }
+        else if (rv == MODSSL_ERROR_HTTP_ON_HTTPS) {
+            /* Plain HTTP spoken on https port, mod_ssl wants to be called
+             * without AP_MODE_INIT.
+             */
+            ap_log_cerror(APLOG_MARK, APLOG_DEBUG, 0, c, APLOGNO(10371)
+                          "SSL handshake with plain HTTP, continuing");
+            sslconn->initialized = 1;
+        }
+        else if (mpm_can_waitio && APR_STATUS_IS_EAGAIN(rv)) {
+            /* Take advantage of an async MPM. If we see an EAGAIN,
+             * loop round and don't block.
+             */
+            ap_log_cerror(APLOG_MARK, APLOG_DEBUG, 0, c, APLOGNO(10372)
+                          "SSL handshake in progress, try again later");
             if (c->cs) {
-                c->cs->state = CONN_STATE_LINGER;
+                c->cs->state = CONN_STATE_ASYNC_WAITIO;
             }
-            ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, c, APLOGNO(10373)
+            status = OK;
+        }
+        else {
+            /* we failed, give up */
+            ap_log_cerror(APLOG_MARK, APLOG_INFO, rv, c, APLOGNO(10373)
                           "SSL handshake was not completed, "
                           "closing connection");
-            return OK;
+            if (c->cs) {
+                c->cs->state = CONN_STATE_LINGER;
+            }
+            status = OK;
         }
     }
-
-    return DECLINED;
+
+    return status;
 }
 
 /*
@@ -746,6 +787,7 @@ static void ssl_register_hooks(apr_pool_t *p)
     ap_hook_http_scheme   (ssl_hook_http_scheme,   NULL,NULL, APR_HOOK_MIDDLE);
     ap_hook_default_port  (ssl_hook_default_port,  NULL,NULL, APR_HOOK_MIDDLE);
     ap_hook_pre_config    (ssl_hook_pre_config,    NULL,NULL, APR_HOOK_MIDDLE);
+    ap_hook_post_config   (ssl_hook_post_config,   NULL,NULL, APR_HOOK_MIDDLE);
     ap_hook_child_init    (ssl_init_Child,         NULL,NULL, APR_HOOK_MIDDLE);
     ap_hook_post_read_request(ssl_hook_ReadReq, pre_prr,NULL, APR_HOOK_MIDDLE);
     ap_hook_check_access  (ssl_hook_Access,        NULL,NULL, APR_HOOK_MIDDLE,
diff --git a/modules/ssl/ssl_engine_io.c b/modules/ssl/ssl_engine_io.c
index 3a2e841ae02..06ebeac2247 100644
--- a/modules/ssl/ssl_engine_io.c
+++ b/modules/ssl/ssl_engine_io.c
@@ -292,6 +292,7 @@ typedef struct {
 } char_buffer_t;
 
 typedef struct {
+    conn_rec *c;
     SSL *ssl;
     BIO *bio_out;
     ap_filter_t *f;
@@ -730,6 +731,32 @@ static apr_status_t ssl_io_input_read(bio_filter_in_ctx_t *inctx,
              * (This is usually the case when the client forces an SSL
              * renegotiation which is handled implicitly by OpenSSL.)
              */
+            if (inctx->c->cs) {
+                inctx->c->cs->sense = CONN_SENSE_WANT_READ;
+            }
+            inctx->rc = APR_EAGAIN;
+
+            if (*len > 0) {
+                inctx->rc = APR_SUCCESS;
+                break;
+            }
+            if (inctx->block == APR_NONBLOCK_READ) {
+                break;
+            }
+            continue; /* Blocking and nothing yet? Try again. */
+        }
+        if (ssl_err == SSL_ERROR_WANT_WRITE) {
+            /*
+             * If OpenSSL wants to write during read, and we were
+             * nonblocking, report as an EAGAIN. Otherwise loop,
+             * pulling more data from network filter.
+             *
+             * (This is usually the case when the client forces an SSL
+             * renegotiation which is handled implicitly by OpenSSL.)
+             */
+            if (inctx->c->cs) {
+                inctx->c->cs->sense = CONN_SENSE_WANT_WRITE;
+            }
             inctx->rc = APR_EAGAIN;
 
             if (*len > 0) {
@@ -895,7 +922,9 @@ static apr_status_t ssl_filter_write(ap_filter_t *f,
              * (This is usually the case when the client forces an SSL
              * renegotiation which is handled implicitly by OpenSSL.)
              */
-            outctx->c->cs->sense = CONN_SENSE_WANT_READ;
+            if (outctx->c->cs) {
+                outctx->c->cs->sense = CONN_SENSE_WANT_READ;
+            }
             outctx->rc = APR_EAGAIN;
             ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, outctx->c,
                           "Want read during nonblocking write");
@@ -950,10 +979,6 @@ static apr_status_t ssl_filter_write(ap_filter_t *f,
                                   sizeof(HTTP_ON_HTTPS_PORT) - 1, \
                                   alloc)
 
-/* Custom apr_status_t error code, used when a plain HTTP request is
- * received on an SSL port. */
-#define MODSSL_ERROR_HTTP_ON_HTTPS (APR_OS_START_USERERR + 0)
-
 /* Custom apr_status_t error code, used when the proxy cannot
  * establish an outgoing SSL connection. */
 #define MODSSL_ERROR_BAD_GATEWAY (APR_OS_START_USERERR + 1)
@@ -989,7 +1014,7 @@ static apr_status_t ssl_io_filter_error(bio_filter_in_ctx_t *inctx,
             f->c->keepalive = AP_CONN_CLOSE;
             if (is_init) {
                 sslconn->non_ssl_request = NON_SSL_SEND_REQLINE;
-                return AP_FILTER_ERROR;
+                return MODSSL_ERROR_HTTP_ON_HTTPS;
             }
             sslconn->non_ssl_request = NON_SSL_SEND_HDR_SEP;
@@ -1424,10 +1449,25 @@ static apr_status_t ssl_io_filter_handshake(ssl_filter_ctx_t *filter_ctx)
         }
         else if (ssl_err == SSL_ERROR_WANT_READ) {
             /*
-             * This is in addition to what was present earlier. It is
-             * borrowed from openssl_state_machine.c [mod_tls].
-             * TBD.
+             * Call us back when ready to read
              */
+            ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, outctx->c,
+                          "Want read during nonblocking accept");
+            if (outctx->c->cs) {
+                outctx->c->cs->sense = CONN_SENSE_WANT_READ;
+            }
+            outctx->rc = APR_EAGAIN;
+            return APR_EAGAIN;
+        }
+        else if (ssl_err == SSL_ERROR_WANT_WRITE) {
+            /*
+             * Call us back when ready to write
+             */
+            ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, outctx->c,
+                          "Want write during nonblocking accept");
+            if (outctx->c->cs) {
+                outctx->c->cs->sense = CONN_SENSE_WANT_WRITE;
+            }
             outctx->rc = APR_EAGAIN;
             return APR_EAGAIN;
         }
@@ -2230,6 +2270,7 @@ static apr_status_t ssl_io_input_add_filter(ssl_filter_ctx_t *filter_ctx, conn_r
     }
 
     BIO_set_data(filter_ctx->pbioRead, (void *)inctx);
+    inctx->c = c;
     inctx->ssl = ssl;
     inctx->bio_out = filter_ctx->pbioWrite;
     inctx->f = filter_ctx->pInputFilter;
diff --git a/modules/ssl/ssl_private.h b/modules/ssl/ssl_private.h
index 2f7bb51fa5a..dc2f4f0d98b 100644
--- a/modules/ssl/ssl_private.h
+++ b/modules/ssl/ssl_private.h
@@ -367,6 +367,12 @@ APLOG_USE_MODULE(ssl);
 #define mySrvConfigFromConn(c) mySrvConfig(mySrvFromConn(c))
 #define myModConfigFromConn(c) myModConfig(mySrvFromConn(c))
 
+/**
+ * Custom apr_status_t error code, used when a plain HTTP request is
+ * received on an SSL port.
+ */
+#define MODSSL_ERROR_HTTP_ON_HTTPS (APR_OS_START_USERERR + 0)
+
 /**
  * Defaults for the configuration
  */
@@ -582,6 +588,7 @@ typedef struct {
     const char *verify_info;
     const char *verify_error;
     int verify_depth;
+    int initialized;
     int disabled;
     enum {
         NON_SSL_OK = 0, /* is SSL request, or error handling completed */

From 6cbda1f1fa81fc7fdfdeeb8e45619978c1f8950f Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 9 Jul 2024 11:37:58 +0200
Subject: [PATCH 20/22] core,http: Non blocking HTTP header read.
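
In short, an async MPM caller is expected to drive the new
ap_read_request_ex() as sketched below. This is a simplified rendition of the
ap_process_http_async_connection() hunk further down, not the actual code:
error handling and scoreboard updates are elided, and mpm_can_waitio stands
for the result of ap_mpm_query(AP_MPMQ_CAN_WAITIO, ...):

    static int mpm_can_waitio; /* from ap_mpm_query(AP_MPMQ_CAN_WAITIO, ...) */

    static int process_connection_sketch(conn_rec *c)
    {
        request_rec *r = NULL;
        /* slave connections (i.e. h2_c2) are not ready for WAITIO yet */
        apr_read_type_e block = (mpm_can_waitio && !c->master)
                                ? APR_NONBLOCK_READ : APR_BLOCK_READ;
        apr_status_t rv = ap_read_request_ex(&r, c, block);

        if (APR_STATUS_IS_EAGAIN(rv)) {
            /* Header not fully received yet; c->partial_request keeps the
             * parsing state and the MPM polls and calls us back later.
             */
            c->cs->state = CONN_STATE_ASYNC_WAITIO;
            return OK;
        }
        if (rv == APR_SUCCESS && r->status == HTTP_OK) {
            c->cs->state = CONN_STATE_HANDLER;
            ap_process_async_request(r);
            return OK;
        }
        c->cs->state = CONN_STATE_LINGER; /* read error or bad request */
        return OK;
    }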
--- include/http_protocol.h | 75 +++- include/httpd.h | 3 + include/mod_core.h | 8 +- modules/http/http_core.c | 43 +- modules/http/http_filters.c | 158 ++++--- modules/http2/h2_stream.c | 2 +- modules/proxy/mod_proxy_http.c | 6 +- server/core.c | 2 +- server/core_filters.c | 71 ++-- server/protocol.c | 753 +++++++++++++++++---------------- 10 files changed, 647 insertions(+), 474 deletions(-) diff --git a/include/http_protocol.h b/include/http_protocol.h index 2b509b341fe..0290abef450 100644 --- a/include/http_protocol.h +++ b/include/http_protocol.h @@ -54,19 +54,30 @@ AP_DECLARE_DATA extern ap_filter_rec_t *ap_old_write_func; */ /** - * Read an empty request and set reasonable defaults. + * Create an empty request and set reasonable defaults. * @param c The current connection * @return The new request_rec */ AP_DECLARE(request_rec *) ap_create_request(conn_rec *c); /** - * Read a request and fill in the fields. + * Read the request line and header fields. * @param c The current connection * @return The new request_rec */ AP_DECLARE(request_rec *) ap_read_request(conn_rec *c); +/** + * Read the request line and header fields, possibly non-blocking. + * @param r The request read + * @param c The connection to read from + * @param block How the read should be performed + * ::APR_BLOCK_READ, ::APR_NONBLOCK_READ + * @return APR_SUCCESS, APR_EAGAIN or APR_EGENERAL + */ +AP_DECLARE(apr_status_t) ap_read_request_ex(request_rec **r, conn_rec *c, + apr_read_type_e block); + /** * Assign the method, uri and protocol (in HTTP/1.x the * items from the first line) to the request. @@ -107,6 +118,12 @@ AP_DECLARE(int) ap_parse_request_line(request_rec *r); */ AP_DECLARE(int) ap_check_request_header(request_rec *r); +/** + * Reentrant state for ap_fgetline_ex() and ap_get_mime_headers_ex() + */ +struct ap_getline_state; /* opaque */ +typedef struct ap_getline_state ap_getline_state_t; + /** * Read the mime-encoded headers. * @param r The current request @@ -122,6 +139,23 @@ AP_DECLARE(void) ap_get_mime_headers(request_rec *r); AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb); +/** + * Reentrant version of ap_get_mime_headers() reading from an input + * filter in blocking or non-blocking mode. + * @param r The current request + * @param f Input filter to read from + * @param block How the operations should be performed + * ::APR_BLOCK_READ, ::APR_NONBLOCK_READ + * @param bb temp brigade + * @param state_p State of the parsing, must point to NULL on first call + * and points to NULL on output if APR_EAGAIN is not returned + */ +AP_DECLARE(apr_status_t) ap_get_mime_headers_ex(request_rec *r, + ap_filter_t *f, + apr_read_type_e block, + apr_bucket_brigade *bb, + ap_getline_state_t **state_p); + /** * Run post_read_request hook and validate. 
* @param r The current request @@ -744,11 +778,13 @@ AP_DECLARE(apr_status_t) ap_get_basic_auth_components(const request_rec *r, */ AP_CORE_DECLARE(void) ap_parse_uri(request_rec *r, const char *uri); -#define AP_GETLINE_FOLD (1 << 0) /* Whether to merge continuation lines */ -#define AP_GETLINE_CRLF (1 << 1) /* Whether line ends must be CRLF */ -#define AP_GETLINE_NOSPC_EOL (1 << 2) /* Whether to consume up to and including - the end of line on APR_ENOSPC */ -#define AP_GETLINE_NONBLOCK (1 << 3) /* Whether to read non-blocking */ +#define AP_GETLINE_FOLD (1 << 0) /* Whether to merge continuation lines */ +#define AP_GETLINE_CRLF (1 << 1) /* Whether line ends must be CRLF */ +#define AP_GETLINE_NOSPC_EOL (1 << 2) /* Whether to consume up to and including + the end of line on APR_ENOSPC */ +#define AP_GETLINE_NONBLOCK (1 << 3) /* Whether to read non-blocking */ +#define AP_GETLINE_ALLOC (1 << 4) /* Whether to allocate the returned line */ +#define AP_GETLINE_FOLD_COL (1 << 5 | AP_GETLINE_FOLD) /* Fold after colon only */ /** * Get the next line of input for the request @@ -783,6 +819,31 @@ AP_DECLARE(apr_status_t) ap_fgetline(char **s, apr_size_t n, int flags, apr_bucket_brigade *bb, apr_pool_t *p); +/** + * Get the next line from an input filter, reentrant (e.g. EAGAIN). + * + * @param s Pointer to the pointer to the buffer into which the line + * should be read; if *s==NULL, a buffer of the necessary size + * to hold the data will be allocated from \p p + * @param n The size of the buffer + * @param read The length of the line. + * @param f Input filter to read from + * @param flags Bit mask of AP_GETLINE_* options + * @param bb Working brigade to use when reading buckets + * @param state_p State of the parsing, must point to NULL on first call + * and points to NULL on output if APR_EAGAIN is not returned + * @param p The pool to allocate the buffer from (if needed) + * @return APR_SUCCESS, if successful + * APR_ENOSPC, if the line is too big to fit in the buffer + * APR_EAGAIN, if non-blocking IO would block + * Other errors where appropriate + */ +AP_DECLARE(apr_status_t) ap_fgetline_ex(char **s, apr_size_t n, + apr_size_t *read, ap_filter_t *f, + int flags, apr_bucket_brigade *bb, + ap_getline_state_t **state_p, + apr_pool_t *p); + /** * @see ap_fgetline * diff --git a/include/httpd.h b/include/httpd.h index c3f72fceb7e..ae08740b227 100644 --- a/include/httpd.h +++ b/include/httpd.h @@ -1315,6 +1315,9 @@ struct conn_rec { int async_filter; int outgoing; + + /** Partial request being read (non-blocking) */ + request_rec *partial_request; }; struct conn_slave_rec { diff --git a/include/mod_core.h b/include/mod_core.h index f9cc0611f4c..b4a40de2d5d 100644 --- a/include/mod_core.h +++ b/include/mod_core.h @@ -41,7 +41,7 @@ extern "C" { /* Handles for core filters */ AP_DECLARE_DATA extern ap_filter_rec_t *ap_http_input_filter_handle; -AP_DECLARE_DATA extern ap_filter_rec_t *ap_h1_request_in_filter_handle; +AP_DECLARE_DATA extern ap_filter_rec_t *ap_h1_header_in_filter_handle; AP_DECLARE_DATA extern ap_filter_rec_t *ap_h1_body_in_filter_handle; AP_DECLARE_DATA extern ap_filter_rec_t *ap_http_header_filter_handle; AP_DECLARE_DATA extern ap_filter_rec_t *ap_chunk_filter_handle; @@ -55,9 +55,9 @@ apr_status_t ap_http_filter(ap_filter_t *f, apr_bucket_brigade *b, ap_input_mode_t mode, apr_read_type_e block, apr_off_t readbytes); -apr_status_t ap_h1_request_in_filter(ap_filter_t *f, apr_bucket_brigade *bb, - ap_input_mode_t mode, apr_read_type_e block, - apr_off_t readbytes); +apr_status_t 
ap_h1_header_in_filter(ap_filter_t *f, apr_bucket_brigade *bb, + ap_input_mode_t mode, apr_read_type_e block, + apr_off_t readbytes); apr_status_t ap_h1_body_in_filter(ap_filter_t *f, apr_bucket_brigade *b, ap_input_mode_t mode, apr_read_type_e block, diff --git a/modules/http/http_core.c b/modules/http/http_core.c index 85858ab2b57..7e9f82f87dd 100644 --- a/modules/http/http_core.c +++ b/modules/http/http_core.c @@ -37,7 +37,7 @@ /* Handles for core filters */ AP_DECLARE_DATA ap_filter_rec_t *ap_http_input_filter_handle; -AP_DECLARE_DATA ap_filter_rec_t *ap_h1_request_in_filter_handle; +AP_DECLARE_DATA ap_filter_rec_t *ap_h1_header_in_filter_handle; AP_DECLARE_DATA ap_filter_rec_t *ap_h1_body_in_filter_handle; AP_DECLARE_DATA ap_filter_rec_t *ap_http_header_filter_handle; AP_DECLARE_DATA ap_filter_rec_t *ap_h1_response_out_filter_handle; @@ -50,7 +50,8 @@ AP_DECLARE_DATA const char *ap_multipart_boundary; /* If we are using an MPM That Supports Async Connections, * use a different processing function */ -static int async_mpm = 0; +static int mpm_is_async = 0; +static int mpm_can_waitio = 0; static const char *set_keep_alive_timeout(cmd_parms *cmd, void *dummy, const char *arg) @@ -145,18 +146,34 @@ static int ap_process_http_async_connection(conn_rec *c) AP_DEBUG_ASSERT(cs->state == CONN_STATE_PROCESSING); if (cs->state == CONN_STATE_PROCESSING) { + apr_read_type_e block = APR_BLOCK_READ; + apr_status_t rv; + + /* slave connections (i.e. h2_c2) not ready for WAITIO yet */ + if (mpm_can_waitio && !c->master) { + block = APR_NONBLOCK_READ; + } + ap_update_child_status_from_conn(c->sbh, SERVER_BUSY_READ, c); if (ap_extended_status) { ap_set_conn_count(c->sbh, r, c->keepalives); } - if ((r = ap_read_request(c))) { + + rv = ap_read_request_ex(&r, c, block); + if (APR_STATUS_IS_EAGAIN(rv)) { + cs->state = CONN_STATE_ASYNC_WAITIO; + return OK; + } + if (rv == APR_SUCCESS) { if (r->status == HTTP_OK) { cs->state = CONN_STATE_HANDLER; + if (ap_extended_status) { ap_set_conn_count(c->sbh, r, c->keepalives + 1); } ap_update_child_status(c->sbh, SERVER_BUSY_WRITE, r); ap_process_async_request(r); + /* After the call to ap_process_request, the * request pool may have been deleted. 
We set * r=NULL here to ensure that any dereference @@ -168,7 +185,8 @@ static int ap_process_http_async_connection(conn_rec *c) } if (cs->state != CONN_STATE_WRITE_COMPLETION && - cs->state != CONN_STATE_SUSPENDED) { + cs->state != CONN_STATE_SUSPENDED && + cs->state != CONN_STATE_LINGER) { /* Something went wrong; close the connection */ cs->state = CONN_STATE_LINGER; } @@ -246,7 +264,7 @@ static int ap_process_http_sync_connection(conn_rec *c) static int ap_process_http_connection(conn_rec *c) { - if (async_mpm && !c->clogging_input_filters) { + if (mpm_is_async && !c->clogging_input_filters) { return ap_process_http_async_connection(c); } else { @@ -276,7 +294,7 @@ static void h1_pre_read_request(request_rec *r, conn_rec *c) if (!r->main && !r->prev && !strcmp(AP_PROTOCOL_HTTP1, ap_get_protocol(c))) { if (r->proxyreq == PROXYREQ_NONE) { - ap_add_input_filter_handle(ap_h1_request_in_filter_handle, + ap_add_input_filter_handle(ap_h1_header_in_filter_handle, NULL, r, r->connection); } ap_add_output_filter_handle(ap_h1_response_out_filter_handle, @@ -343,9 +361,14 @@ static int http_send_options(request_rec *r) static int http_post_config(apr_pool_t *p, apr_pool_t *plog, apr_pool_t *ptemp, server_rec *s) { apr_uint64_t val; - if (ap_mpm_query(AP_MPMQ_IS_ASYNC, &async_mpm) != APR_SUCCESS) { - async_mpm = 0; + + if (ap_mpm_query(AP_MPMQ_IS_ASYNC, &mpm_is_async) != APR_SUCCESS) { + mpm_is_async = 0; } + if (ap_mpm_query(AP_MPMQ_CAN_WAITIO, &mpm_can_waitio) != APR_SUCCESS) { + mpm_can_waitio = 0; + } + ap_random_insecure_bytes(&val, sizeof(val)); ap_multipart_boundary = apr_psprintf(p, "%0" APR_UINT64_T_HEX_FMT, val); @@ -369,8 +392,8 @@ static void register_hooks(apr_pool_t *p) ap_http_input_filter_handle = ap_register_input_filter("HTTP_IN", ap_http_filter, NULL, AP_FTYPE_PROTOCOL); - ap_h1_request_in_filter_handle = - ap_register_input_filter("HTTP1_REQUEST_IN", ap_h1_request_in_filter, + ap_h1_header_in_filter_handle = + ap_register_input_filter("HTTP1_HEADER_IN", ap_h1_header_in_filter, NULL, AP_FTYPE_PROTOCOL); ap_h1_body_in_filter_handle = ap_register_input_filter("HTTP1_BODY_IN", ap_h1_body_in_filter, diff --git a/modules/http/http_filters.c b/modules/http/http_filters.c index 426fe2fcb97..d7667c8c361 100644 --- a/modules/http/http_filters.c +++ b/modules/http/http_filters.c @@ -264,9 +264,10 @@ static apr_status_t read_chunked_trailers(http_ctx_t *ctx, ap_filter_t *f, apr_bucket *e; request_rec *r = f->r; apr_table_t *trailers; - apr_table_t *saved_headers_in = r->headers_in; + apr_table_t *saved_headers_in; int saved_status = r->status; + saved_headers_in = r->headers_in; trailers = apr_table_make(r->pool, 5); r->status = HTTP_OK; r->headers_in = trailers; @@ -2174,18 +2175,34 @@ typedef struct h1_request_ctx { const char *method; const char *uri; const char *protocol; + + /* parsing context */ + ap_getline_state_t *getline_state; + apr_bucket_brigade *tmp_bb; + int num_blank_lines; } h1_request_ctx; -static apr_status_t read_request_line(h1_request_ctx *ctx, apr_bucket_brigade *bb) +static apr_status_t read_request_line(h1_request_ctx *ctx, + ap_filter_t *f, apr_read_type_e block, + apr_bucket_brigade *bb) { - apr_size_t len; - int num_blank_lines = DEFAULT_LIMIT_BLANK_LINES; - core_server_config *conf = ap_get_core_module_config(ctx->r->server->module_config); + request_rec *r = ctx->r; + apr_size_t max_size = r->server->limit_req_line + 2 + 1; /* + CRLF + \0 */ + core_server_config *conf = ap_get_core_module_config(r->server->module_config); int strict = (conf->http_conformance != 
AP_HTTP_CONFORMANCE_UNSAFE); + int flags = AP_GETLINE_ALLOC; apr_status_t rv; + if (strict) { + flags |= AP_GETLINE_CRLF; + } + if (block == APR_NONBLOCK_READ) { + flags |= AP_GETLINE_NONBLOCK; + } + /* Read past empty lines until we get a real request line, * a read error, the connection closes (EOF), or we timeout. + * Reentrance on EAGAIN is handled in/by ctx->getline_state. * * We skip empty lines because browsers have to tack a CRLF on to the end * of POSTs to support old CERN webservers. But note that we may not @@ -2199,52 +2216,35 @@ static apr_status_t read_request_line(h1_request_ctx *ctx, apr_bucket_brigade *b * have to block during a read. */ do { - /* ensure ap_rgetline allocates memory each time thru the loop - * if there are empty lines - */ - ctx->request_line = NULL; - len = 0; - rv = ap_rgetline(&ctx->request_line, (apr_size_t)(ctx->r->server->limit_req_line + 2), - &len, ctx->r, strict ? AP_GETLINE_CRLF : 0, bb); + apr_size_t len = 0; + /* allocates memory each time thru the loop */ + rv = ap_fgetline_ex(&ctx->request_line, max_size, &len, f, flags, + bb, &ctx->getline_state, r->pool); if (rv != APR_SUCCESS) { return rv; } - else if (len > 0) { - /* got the line in ctx->r->the_request */ + if (len > 0) { + /* got full line */ return APR_SUCCESS; } - } while (--num_blank_lines >= 0); + } while (--ctx->num_blank_lines >= 0); + /* too many blank lines */ return APR_EINVAL; } -static void sanitize_brigade(apr_bucket_brigade *bb) -{ - apr_bucket *e, *next; - - for (e = APR_BRIGADE_FIRST(bb); - e != APR_BRIGADE_SENTINEL(bb); - e = next) - { - next = APR_BUCKET_NEXT(e); - if (!APR_BUCKET_IS_METADATA(e) && e->length == 0) { - apr_bucket_delete(e); - } - } -} - -apr_status_t ap_h1_request_in_filter(ap_filter_t *f, - apr_bucket_brigade *bb, - ap_input_mode_t mode, - apr_read_type_e block, - apr_off_t readbytes) +apr_status_t ap_h1_header_in_filter(ap_filter_t *f, + apr_bucket_brigade *bb, + ap_input_mode_t mode, + apr_read_type_e block, + apr_off_t readbytes) { request_rec *r = f->r; - apr_bucket *e; h1_request_ctx *ctx = f->ctx; apr_status_t rv = APR_SUCCESS; int http_status = HTTP_OK; + apr_bucket *e; /* just get out of the way for things we don't want to handle. */ if (mode != AP_MODE_READBYTES && mode != AP_MODE_GETLINE) { @@ -2255,15 +2255,23 @@ apr_status_t ap_h1_request_in_filter(ap_filter_t *f, f->ctx = ctx = apr_pcalloc(r->pool, sizeof(*ctx)); ctx->r = r; ctx->state = REQ_LINE; + ctx->num_blank_lines = DEFAULT_LIMIT_BLANK_LINES; + ctx->tmp_bb = apr_brigade_create(r->pool, r->connection->bucket_alloc); } - /* This filter needs to get out of the way of read_request_line() */ - ap_remove_input_filter(f); - - while (APR_SUCCESS == rv) { + for (;;) { switch (ctx->state) { case REQ_LINE: - if ((rv = read_request_line(ctx, bb)) != APR_SUCCESS) { + rv = read_request_line(ctx, f->next, block, ctx->tmp_bb); + apr_brigade_cleanup(ctx->tmp_bb); + + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + ap_log_rerror(APLOG_MARK, APLOG_TRACE6, rv, r, + "reading request line"); + rv = APR_EAGAIN; + goto cleanup; + } + if (rv != APR_SUCCESS) { /* certain failures are answered with a HTTP error bucket * and are terminal for parsing a request */ ctx->method = ctx->uri = "-"; @@ -2280,60 +2288,76 @@ apr_status_t ap_h1_request_in_filter(ap_filter_t *f, else if (APR_STATUS_IS_EINVAL(rv)) { http_status = HTTP_BAD_REQUEST; } + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, rv, r, + "failed reading request line (status %d)", + http_status != HTTP_OK ? 
http_status : -1); goto cleanup; } if (!ap_h1_tokenize_request_line(r, ctx->request_line, - &ctx->method, &ctx->uri, &ctx->protocol)) { + &ctx->method, &ctx->uri, + &ctx->protocol)) { http_status = HTTP_BAD_REQUEST; + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r, + "failed tokenizing request line, " + "returning error bucket %d", + http_status); goto cleanup; } + /* got the request line and it looked to contain what we need */ ctx->state = REQ_HEADERS; break; case REQ_HEADERS: - ap_get_mime_headers_core(r, bb); - if (r->status != HTTP_OK) { - ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(00567) - "request failed: error reading the headers"); - http_status = r->status; + rv = ap_get_mime_headers_ex(r, f->next, block, ctx->tmp_bb, + &ctx->getline_state); + apr_brigade_cleanup(ctx->tmp_bb); + + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + ap_log_rerror(APLOG_MARK, APLOG_TRACE6, rv, r, + "reading request headers"); + goto cleanup; + } + if (rv != APR_SUCCESS || r->status != HTTP_OK) { + http_status = (r->status == HTTP_OK + ? HTTP_INTERNAL_SERVER_ERROR + : r->status); + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, r, APLOGNO(00567) + "request failed: error reading the headers (%i)", + http_status); + r->status = HTTP_OK; goto cleanup; } - /* clear the brigade, as ap_get_mime_headers_core() leaves the last - * empty line in there, insert the REQUEST bucket and return */ - apr_brigade_cleanup(bb); + e = ap_bucket_request_createn(ctx->method, ctx->uri, ctx->protocol, r->headers_in, r->pool, r->connection->bucket_alloc); - /* reading may leave 0 length data buckets in the brigade, - * get rid of those. */ - sanitize_brigade(bb); - APR_BRIGADE_INSERT_HEAD(bb, e); - ctx->state = REQ_BODY; - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, rv, r, + APR_BRIGADE_INSERT_TAIL(bb, e); + + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r, "http1 request and headers parsed: %s %s %s", ctx->method, ctx->uri, ctx->protocol); - goto cleanup; - - case REQ_BODY: - /* we should not come here */ - AP_DEBUG_ASSERT(0); - rv = ap_get_brigade(f->next, bb, mode, block, readbytes); + /* Got the header, done with this filter */ + ap_remove_input_filter(f); + ctx->state = REQ_BODY; goto cleanup; case REQ_ERROR: - default: + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r, + "invalid request read while in error"); rv = APR_EINVAL; goto cleanup; + + default: + /* we should never come here */ + ap_assert(0); + break; } - } /* while(APR_SUCCESS == rv) */ + } cleanup: if (http_status != HTTP_OK) { - ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r, - "failed reading request line, returning error bucket %d", http_status); - apr_brigade_cleanup(bb); e = ap_bucket_error_create(http_status, NULL, r->pool, f->c->bucket_alloc); APR_BRIGADE_INSERT_TAIL(bb, e); diff --git a/modules/http2/h2_stream.c b/modules/http2/h2_stream.c index ee87555f9f3..b050b4d962c 100644 --- a/modules/http2/h2_stream.c +++ b/modules/http2/h2_stream.c @@ -755,7 +755,7 @@ apr_status_t h2_stream_add_header(h2_stream *stream, } if (session->s->limit_req_fields > 0 - && stream->request_headers_added > session->s->limit_req_fields) { + && stream->request_headers_added >= session->s->limit_req_fields) { /* already over limit, count this attempt, but do not take it in */ ++stream->request_headers_added; } diff --git a/modules/proxy/mod_proxy_http.c b/modules/proxy/mod_proxy_http.c index bfeee868558..38da5b0f7f6 100644 --- a/modules/proxy/mod_proxy_http.c +++ b/modules/proxy/mod_proxy_http.c @@ -888,10 +888,8 @@ static apr_status_t 
ap_proxy_read_headers(request_rec *r, request_rec *rr, tmp_bb = apr_brigade_create(r->pool, c->bucket_alloc); while (1) { - rc = ap_proxygetline(tmp_bb, buffer, size, rr, - AP_GETLINE_FOLD | AP_GETLINE_NOSPC_EOL, &len); - - + const int flags = AP_GETLINE_FOLD_COL; + rc = ap_proxygetline(tmp_bb, buffer, size, rr, flags, &len); if (rc != APR_SUCCESS) { if (APR_STATUS_IS_ENOSPC(rc)) { int trunc = (len > 128 ? 128 : len) / 2; diff --git a/server/core.c b/server/core.c index 4d5d569d93b..632af394d8f 100644 --- a/server/core.c +++ b/server/core.c @@ -5551,7 +5551,7 @@ static conn_rec *core_create_conn(apr_pool_t *ptrans, server_rec *s, c->id = id; c->bucket_alloc = alloc; c->async_filter = sconf->async_filter; - + c->keepalive = AP_CONN_UNKNOWN; c->clogging_input_filters = 0; if (sconf->conn_log_level) { diff --git a/server/core_filters.c b/server/core_filters.c index 0887603b9ab..2dbc5afbb83 100644 --- a/server/core_filters.c +++ b/server/core_filters.c @@ -142,13 +142,18 @@ apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b, if (mode == AP_MODE_GETLINE) { /* we are reading a single LF line, e.g. the HTTP headers */ rv = apr_brigade_split_line(b, ctx->bb, block, HUGE_STRING_LEN); - /* We should treat EAGAIN here the same as we do for EOF (brigade is - * empty). We do this by returning whatever we have read. This may - * or may not be bogus, but is consistent (for now) with EOF logic. + + /* To distinguish EAGAIN from EOS (for which apr_brigade_split_line() + * returns an empty brigade), return an empty brigade only for the + * former and APR_EOF for the latter. */ if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { rv = APR_SUCCESS; } + else if (rv == APR_SUCCESS && APR_BRIGADE_EMPTY(b)) { + AP_DEBUG_ASSERT(APR_BRIGADE_EMPTY(ctx->bb)); + rv = APR_EOF; + } goto cleanup; } @@ -234,31 +239,43 @@ apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b, AP_DEBUG_ASSERT(readbytes > 0); - e = APR_BRIGADE_FIRST(ctx->bb); - rv = apr_bucket_read(e, &str, &len, block); - if (rv != APR_SUCCESS) { - if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + do { + e = APR_BRIGADE_FIRST(ctx->bb); + rv = apr_bucket_read(e, &str, &len, block); + if (rv != APR_SUCCESS) { /* getting EAGAIN for a blocking read is an error; not for a - * non-blocking read, return an empty brigade. */ - rv = APR_SUCCESS; + * non-blocking read, return an empty brigade w/ APR_SUCCESS */ + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + rv = APR_SUCCESS; + } + goto cleanup; } - goto cleanup; - } - else if (block == APR_BLOCK_READ && len == 0) { - /* We wanted to read some bytes in blocking mode. We read - * 0 bytes. Hence, we now assume we are EOS. - * - * When we are in normal mode, return an EOS bucket to the - * caller. - * When we are in speculative mode, leave ctx->bb empty, so - * that the next call returns an EOS bucket. - */ - apr_bucket_delete(e); + if (len > 0) { + break; + } + if (APR_BUCKET_IS_METADATA(e)) { + APR_BUCKET_REMOVE(e); + APR_BRIGADE_INSERT_TAIL(b, e); + } + else { + apr_bucket_delete(e); + } + } while (!APR_BRIGADE_EMPTY(ctx->bb)); - if (mode == AP_MODE_READBYTES) { + if (len == 0) { + /* We are at EOS. + * In normal blocking mode, return an EOS bucket. + * Otherwise it's not expected by the caller, so return APR_EOF + * directly. 
+ */ + AP_DEBUG_ASSERT(APR_BRIGADE_EMPTY(ctx->bb)); + if (mode == AP_MODE_READBYTES && block == APR_BLOCK_READ) { e = apr_bucket_eos_create(c->bucket_alloc); APR_BRIGADE_INSERT_TAIL(b, e); } + else if (APR_BRIGADE_EMPTY(b)) { + rv = APR_EOF; + } goto cleanup; } @@ -266,7 +283,7 @@ apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b, if (len < readbytes) { apr_size_t bucket_len; - /* We already registered the data in e in len */ + /* We already accounted for e in len */ e = APR_BUCKET_NEXT(e); while ((len < readbytes) && (rv == APR_SUCCESS) && (e != APR_BRIGADE_SENTINEL(ctx->bb))) { @@ -290,11 +307,11 @@ apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b, } } } - } - /* We can only return at most what we read. */ - if (len < readbytes) { - readbytes = len; + /* We can only return at most what we read. */ + if (len < readbytes) { + readbytes = len; + } } rv = apr_brigade_partition(ctx->bb, readbytes, &e); diff --git a/server/protocol.c b/server/protocol.c index 9ac4e3fe929..e0334722cda 100644 --- a/server/protocol.c +++ b/server/protocol.c @@ -61,6 +61,10 @@ #undef APLOG_MODULE_INDEX #define APLOG_MODULE_INDEX AP_CORE_MODULE_INDEX +#ifndef AP_ASCII_COLON +#define AP_ASCII_COLON '\x3a' +#endif + APR_HOOK_STRUCT( APR_HOOK_LINK(pre_read_request) APR_HOOK_LINK(post_read_request) @@ -210,55 +214,66 @@ AP_DECLARE(apr_time_t) ap_rationalize_mtime(request_rec *r, apr_time_t mtime) * If no LF is detected on the last line due to a dropped connection * or a full buffer, that's considered an error. */ -static apr_status_t ap_fgetline_core(char **s, apr_size_t n, - apr_size_t *read, ap_filter_t *f, - int flags, apr_bucket_brigade *bb, - apr_pool_t *p) +enum folding_state_e { + NOT_FOLDING = 0, + FOLDING_FIND, + FOLDING_READ, + FOLDING_DONE, +}; +struct ap_getline_state { + char *buf; + apr_size_t len; + apr_size_t max_size; + apr_size_t alloc_size; + apr_size_t folding_len; + enum folding_state_e folding_state; + unsigned int folding_col :1, + allocate :1, + reusable :1; +}; +static apr_status_t ap_fgetline_core(ap_getline_state_t *state, + ap_filter_t *f, int flags, + apr_bucket_brigade *bb, + apr_pool_t *p, + int rec) { apr_status_t rv; - apr_bucket *e; - apr_size_t bytes_handled = 0, current_alloc = 0; - char *pos, *last_char = *s; - int do_alloc = (*s == NULL), saw_eos = 0; + apr_read_type_e block; int fold = flags & AP_GETLINE_FOLD; int crlf = flags & AP_GETLINE_CRLF; + int do_alloc = (flags & AP_GETLINE_ALLOC) || state->allocate; int nospc_eol = flags & AP_GETLINE_NOSPC_EOL; - int saw_eol = 0, saw_nospc = 0; - apr_read_type_e block; + apr_status_t late_rv = APR_SUCCESS; + int seen_eol = 0, seen_nospc = 0; + apr_bucket *e; - if (!n) { + state->reusable = 0; /* until further notice */ + + if (state->max_size == 0) { /* Needs room for NUL byte at least */ - *read = 0; return APR_BADARG; } block = (flags & AP_GETLINE_NONBLOCK) ? APR_NONBLOCK_READ : APR_BLOCK_READ; - /* - * Initialize last_char as otherwise a random value will be compared - * against APR_ASCII_LF at the end of the loop if bb only contains - * zero-length buckets. - */ - if (last_char) - *last_char = '\0'; - + if (state->folding_state == FOLDING_FIND) { + /* EAGAIN looking up for folding line, continue there */ + goto find_folding; + } do { apr_brigade_cleanup(bb); rv = ap_get_brigade(f, bb, AP_MODE_GETLINE, block, 0); if (rv != APR_SUCCESS) { goto cleanup; } - - /* Something horribly wrong happened. Someone didn't block! 
-         * (this also happens at the end of each keepalive connection)
-         * (this also happens when non-blocking is asked too, not that wrong)
-         */
         if (APR_BRIGADE_EMPTY(bb)) {
-            if (block != APR_NONBLOCK_READ) {
+            if (block == APR_BLOCK_READ) {
+                /* Something horribly wrong happened.  Someone didn't block! */
                 rv = APR_EGENERAL;
             }
             else {
+                /* A non-blocking read that would block gets us here */
                 rv = APR_EAGAIN;
             }
             goto cleanup;
@@ -271,10 +286,10 @@ static apr_status_t ap_fgetline_core(char **s, apr_size_t n,
             const char *str;
             apr_size_t len;
 
-            /* If we see an EOS, don't bother doing anything more. */
+            /* APR_EOF on EOS (CRLF is missing) */
             if (APR_BUCKET_IS_EOS(e)) {
-                saw_eos = 1;
-                break;
+                rv = APR_EOF;
+                goto cleanup;
             }
 
             rv = apr_bucket_read(e, &str, &len, APR_BLOCK_READ);
@@ -282,6 +297,27 @@ static apr_status_t ap_fgetline_core(char **s, apr_size_t n,
                 goto cleanup;
             }
 
+            /* If folding, trim leading blanks */
+            if (state->folding_state == FOLDING_READ && len > 0) {
+                size_t i;
+                for (i = 0; i < len; ++i) {
+                    const char c = str[i];
+                    if (c != APR_ASCII_BLANK && c != APR_ASCII_TAB) {
+                        break;
+                    }
+                }
+                state->folding_len += i;
+                ap_assert(state->folding_len > 0);
+                str += i;
+                len -= i;
+
+                /* Fail if the line is composed of blanks only */
+                if ((len > 0 && str[0] == APR_ASCII_LF)
+                        || (len > 1 && str[0] == APR_ASCII_CR
+                            && str[1] == APR_ASCII_LF)) {
+                    late_rv = APR_EINVAL;
+                }
+            }
             if (len == 0) {
                 /* no use attempting a zero-byte alloc (hurts when
                  * using --with-efence --enable-pool-debug) or
@@ -290,11 +326,13 @@ static apr_status_t ap_fgetline_core(char **s, apr_size_t n,
                 continue;
             }
 
-            /* Would this overrun our buffer? If so, we'll die. */
-            if (n < bytes_handled + len) {
+            /* Would this exceed the limit? If so, we'll die. */
+            if (state->len + state->folding_len + len >= state->max_size) {
+                apr_size_t read_len = state->len + state->folding_len;
+
                 /* Before we die, let's fill the buffer up to its limit (i.e.
                  * fall through with the remaining length, if any), setting
-                 * saw_eol on LF to stop the outer loop appropriately; we may
+                 * seen_eol on LF to stop the outer loop appropriately; we may
                  * come back here once the buffer is filled (no LF seen), and
                  * either be done at that time or continue to wait for LF here
                  * if nospc_eol is set.
@@ -306,248 +344,299 @@ static apr_status_t ap_fgetline_core(char **s, apr_size_t n,
                  * we have to handle the case so that it's not returned to the
                  * caller as part of the truncated line (it's not!). This is
                  * easier to consider that LF is out of counting and thus fall
-                 * through with no error (saw_eol is set to 2 so that we later
+                 * through with no error (seen_eol is set to 2 so that we later
                  * ignore LF handling already done here), while folding and
                  * nospc_eol logics continue to work (or fail) appropriately.
                  */
-                saw_eol = (str[len - 1] == APR_ASCII_LF);
-                if (/* First time around */
-                    saw_eol && !saw_nospc
-                    /* Single LF completing the buffered CR, */
-                    && ((len == 1 && ((*s)[bytes_handled - 1] == APR_ASCII_CR))
-                        /* or trailing CRLF overuns by LF only */
-                        || (len > 1 && str[len - 2] == APR_ASCII_CR
-                            && n - bytes_handled + 1 == len))) {
-                    /* In both cases *last_char is (to be) the CR stripped by
-                     * later 'bytes_handled = last_char - *s'.
- */ - saw_eol = 2; + seen_eol = (str[len - 1] == APR_ASCII_LF); + if (!seen_eol + || seen_nospc + || read_len + len != state->max_size) { + /* Some data lost */ + late_rv = APR_ENOSPC; + seen_nospc = 1; + } + else if ((len == 1 + && state->len > 0 + && state->buf[state->len - 1] == APR_ASCII_CR) + || (len > 1 && str[len - 2] == APR_ASCII_CR)) { + /* CR[LF] is to be stripped */ + seen_eol = 2; } else { - /* In any other case we'd lose data. */ - rv = APR_ENOSPC; - saw_nospc = 1; + /* Single LF to be stripped (or fail if AP_GETLINE_CRLF) */ + AP_DEBUG_ASSERT(seen_eol == 1); } - len = n - bytes_handled; - if (!len) { - if (saw_eol) { - break; - } - if (nospc_eol) { - continue; - } - goto cleanup; + + if (read_len + 1 >= state->max_size) { + /* Full, check loop condition */ + continue; } + + /* Fall through (fill buf up to len) */ + len = state->max_size - read_len - 1; } /* Do we have to handle the allocation ourselves? */ if (do_alloc) { + apr_size_t more_len = len + (state->folding_state == FOLDING_READ); + /* We'll assume the common case where one bucket is enough. */ - if (!*s) { - current_alloc = len; - *s = apr_palloc(p, current_alloc + 1); + if (state->buf == NULL) { + state->alloc_size = more_len + 1; + state->buf = apr_palloc(p, state->alloc_size); } - else if (bytes_handled + len > current_alloc) { + else if (state->len + more_len >= state->alloc_size) { /* Increase the buffer size */ - apr_size_t new_size = current_alloc * 2; + apr_size_t new_size; char *new_buffer; - if (bytes_handled + len > new_size) { - new_size = (bytes_handled + len) * 2; + if (state->alloc_size >= state->max_size / 2) { + new_size = state->max_size; } + else { + new_size = state->alloc_size * 2; + if (state->len + more_len >= new_size) { + new_size = state->len + more_len + 1; + } + } + ap_assert(new_size > state->len + more_len); - new_buffer = apr_palloc(p, new_size + 1); + new_buffer = apr_palloc(p, new_size); /* Copy what we already had. */ - memcpy(new_buffer, *s, bytes_handled); - current_alloc = new_size; - *s = new_buffer; + memcpy(new_buffer, state->buf, state->len); + state->alloc_size = new_size; + state->buf = new_buffer; } } - /* Just copy the rest of the data to the end of the old buffer. */ - pos = *s + bytes_handled; - memcpy(pos, str, len); - last_char = pos + len - 1; - - /* We've now processed that new data - update accordingly. */ - bytes_handled += len; + if (state->folding_state == FOLDING_READ) { + /* Replace all blanks with a single one. */ + state->buf[state->len++] = APR_ASCII_BLANK; + state->folding_state = FOLDING_DONE; + } + /* Just copy new data to the end of the buffer. */ + memcpy(state->buf + state->len, str, len); + state->len += len; } /* If we got a full line of input, stop reading */ - if (last_char && (*last_char == APR_ASCII_LF)) { - saw_eol = 1; + if (state->len && state->buf[state->len - 1] == APR_ASCII_LF) { + seen_eol = 1; } - } while (!saw_eol); + } while (!seen_eol && (!seen_nospc || nospc_eol)); - if (rv != APR_SUCCESS) { - /* End of line after APR_ENOSPC above */ + if (late_rv != APR_SUCCESS) { + rv = late_rv; + goto cleanup; + } + if (state->folding_state == FOLDING_READ) { + /* Folding is blank only */ + rv = APR_EINVAL; goto cleanup; } /* Now terminate the string at the end of the line; * if the last-but-one character is a CR, terminate there. - * LF is handled above (not accounted) when saw_eol == 2, + * LF is handled above (not accounted) when seen_eol == 2, * the last char is CR to terminate at still. 
*/ - if (saw_eol < 2) { - if (last_char > *s && last_char[-1] == APR_ASCII_CR) { - last_char--; + state->len--; + if (seen_eol != 2) { + if (state->len && state->buf[state->len - 1] == APR_ASCII_CR) { + state->len--; } else if (crlf) { rv = APR_EINVAL; goto cleanup; } } - bytes_handled = last_char - *s; - /* If we're folding, we have more work to do. + /* If we have to search for folding, we have more work to do. + * If folding already, let the (recursive) caller loop for the next + * folding line if any and thus issue terminal recursions only. * - * Note that if an EOS was seen, we know we can't have another line. + * Note that if an empty line or an EOS was seen, we know we can't have + * another line. */ - if (fold && bytes_handled && !saw_eos) { + if (fold && !state->folding_state && state->len) { + state->folding_state = FOLDING_FIND; +find_folding: + flags &= ~AP_GETLINE_FOLD; for (;;) { const char *str; apr_size_t len; - char c; - - /* Clear the temp brigade for this filter read. */ - apr_brigade_cleanup(bb); + char c = 0; /* We only care about the first byte. */ + apr_brigade_cleanup(bb); rv = ap_get_brigade(f, bb, AP_MODE_SPECULATIVE, block, 1); if (rv != APR_SUCCESS) { goto cleanup; } - if (APR_BRIGADE_EMPTY(bb)) { + if (block != APR_NONBLOCK_READ) { + rv = APR_EGENERAL; + } + else { + rv = APR_EAGAIN; + } break; } + do { + e = APR_BRIGADE_FIRST(bb); - e = APR_BRIGADE_FIRST(bb); - - /* If we see an EOS, don't bother doing anything more. */ - if (APR_BUCKET_IS_EOS(e)) { - break; - } - - rv = apr_bucket_read(e, &str, &len, APR_BLOCK_READ); - if (rv != APR_SUCCESS) { - apr_brigade_cleanup(bb); - goto cleanup; - } - - /* Found one, so call ourselves again to get the next line. - * - * FIXME: If the folding line is completely blank, should we - * stop folding? Does that require also looking at the next - * char? - */ - /* When we call destroy, the buckets are deleted, so save that - * one character we need. This simplifies our execution paths - * at the cost of one character read. - */ - c = *str; - if (c == APR_ASCII_BLANK || c == APR_ASCII_TAB) { - /* Do we have enough space? We may be full now. */ - if (bytes_handled >= n) { - rv = APR_ENOSPC; + /* APR_EOF on EOS (CRLF is missing) */ + if (APR_BUCKET_IS_EOS(e)) { + rv = APR_EOF; goto cleanup; } - else { - apr_size_t next_size, next_len; - char *tmp; - /* If we're doing the allocations for them, we have to - * give ourselves a NULL and copy it on return. - */ - if (do_alloc) { - tmp = NULL; - } - else { - tmp = last_char; - } - - next_size = n - bytes_handled; - - rv = ap_fgetline_core(&tmp, next_size, &next_len, f, - flags & ~AP_GETLINE_FOLD, bb, p); - if (rv != APR_SUCCESS) { - goto cleanup; - } - - if (do_alloc && next_len > 0) { - char *new_buffer; - apr_size_t new_size = bytes_handled + next_len + 1; - - /* we need to alloc an extra byte for a null */ - new_buffer = apr_palloc(p, new_size); + rv = apr_bucket_read(e, &str, &len, APR_BLOCK_READ); + if (rv != APR_SUCCESS) { + goto cleanup; + } + if (len > 0) { + c = *str; + break; + } - /* Copy what we already had. 
*/ - memcpy(new_buffer, *s, bytes_handled); + apr_bucket_delete(e); + } while (!APR_BRIGADE_EMPTY(bb)); - /* copy the new line, including the trailing null */ - memcpy(new_buffer + bytes_handled, tmp, next_len); - *s = new_buffer; - } + if (APR_BRIGADE_EMPTY(bb)) { + /* No useful data, continue reading */ + continue; + } + if (c != APR_ASCII_BLANK && c != APR_ASCII_TAB) { + /* Not a continuation line */ + state->folding_state = NOT_FOLDING; + state->folding_col = 0; + break; + } - last_char += next_len; - bytes_handled += next_len; + /* Found one, may be allowed after a colon char only */ + if ((flags & AP_GETLINE_FOLD_COL) && !state->folding_col) { + if (!memchr(state->buf, AP_ASCII_COLON, state->len)) { + rv = APR_EINVAL; + goto cleanup; } + state->folding_col = 1; } - else { /* next character is not tab or space */ - break; + + /* Before folding, trim trailing blanks */ + while (state->len + && (state->buf[state->len - 1] == APR_ASCII_BLANK + || state->buf[state->len - 1] == APR_ASCII_TAB)) { + state->folding_len++; + state->len--; + } + + /* Call ourselves again to get the next line. */ + state->folding_state = FOLDING_READ; + rv = ap_fgetline_core(state, f, flags, bb, p, 1); + if (rv != APR_SUCCESS) { + goto cleanup; } + state->folding_state = FOLDING_FIND; } } cleanup: - if (bytes_handled >= n) { - bytes_handled = n - 1; + if (rec) { + /* On recursion, let the caller do the finalization */ + return rv; } + if (state->buf) { + apr_size_t len; - *read = bytes_handled; - if (*s) { /* ensure the string is NUL terminated */ - (*s)[*read] = '\0'; + state->buf[state->len] = '\0'; /* PR#43039: We shouldn't accept NULL bytes within the line */ - bytes_handled = strlen(*s); - if (bytes_handled < *read) { + len = strlen(state->buf); + if (len < state->len) { ap_log_data(APLOG_MARK, APLOG_DEBUG, ap_server_conf, - "NULL bytes in header", *s, *read, 0); - *read = bytes_handled; + "NULL bytes in header", state->buf, state->len, 0); if (rv == APR_SUCCESS) { rv = APR_EINVAL; } + state->len = len; } } + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + state->reusable = 1; + rv = APR_EAGAIN; + } + apr_brigade_cleanup(bb); return rv; } -AP_DECLARE(apr_status_t) ap_fgetline(char **s, apr_size_t n, - apr_size_t *read, ap_filter_t *f, - int flags, apr_bucket_brigade *bb, - apr_pool_t *p) +AP_DECLARE(apr_status_t) ap_fgetline_ex(char **s, apr_size_t n, + apr_size_t *read, ap_filter_t *f, + int flags, apr_bucket_brigade *bb, + ap_getline_state_t **state_p, + apr_pool_t *p) { apr_status_t rv; - - rv = ap_fgetline_core(s, n, read, f, flags, bb, p); + ap_getline_state_t *state = *state_p; +#if APR_CHARSET_EBCDIC + apr_size_t prev_len = 0; +#endif + if (!state || !state->reusable) { + if (!state) { + *state_p = state = apr_pcalloc(p, sizeof(*state)); + } + else { + memset(state, 0, sizeof(*state)); + } + if (*s && !(flags & AP_GETLINE_ALLOC)) { + state->buf = *s; + } + else { + state->allocate = 1; + *s = NULL; + } + state->max_size = n; + } +#if APR_CHARSET_EBCDIC + else { + prev_len = state->len; + } +#endif + + rv = ap_fgetline_core(state, f, flags, bb, p, 0); + + *s = state->buf; + *read = state->len; #if APR_CHARSET_EBCDIC /* On EBCDIC boxes, each complete http protocol input line needs to be * translated into the code page used by the compiler. Since * ap_fgetline_core uses recursion, we do the translation in a wrapper * function to ensure that each input character gets translated only once. 
*/ - if (*read) { - ap_xlate_proto_from_ascii(*s, *read); + if (*read > prev_len) { + ap_xlate_proto_from_ascii(*s + prev_len, *read - prev_len); } #endif return rv; } +AP_DECLARE(apr_status_t) ap_fgetline(char **s, apr_size_t n, + apr_size_t *read, ap_filter_t *f, + int flags, apr_bucket_brigade *bb, + apr_pool_t *p) +{ + ap_getline_state_t stack_state; + ap_getline_state_t *state = &stack_state; + state->reusable = 0; + + return ap_fgetline_ex(s, n, read, f, flags, bb, &state, p); +} + /* Same as ap_fgetline(), working on r's pool and protocol input filters. * Pulls from r->proto_input_filters instead of r->input_filters for * stricter protocol adherence and better input filter behavior during @@ -557,22 +646,8 @@ AP_DECLARE(apr_status_t) ap_rgetline(char **s, apr_size_t n, apr_size_t *read, request_rec *r, int flags, apr_bucket_brigade *bb) { - apr_status_t rv; - - rv = ap_fgetline_core(s, n, read, r->proto_input_filters, flags, - bb, r->pool); -#if APR_CHARSET_EBCDIC - /* On EBCDIC boxes, each complete http protocol input line needs to be - * translated into the code page used by the compiler. Since - * ap_fgetline_core uses recursion, we do the translation in a wrapper - * function to ensure that each input character gets translated only once. - */ - if (*read) { - ap_xlate_proto_from_ascii(*s, *read); - } -#endif - - return rv; + return ap_fgetline(s, n, read, r->proto_input_filters, + flags, bb, r->pool); } AP_DECLARE(int) ap_getline(char *s, int n, request_rec *r, int flags) @@ -790,30 +865,40 @@ static int table_do_fn_check_lengths(void *r_, const char *key, return 0; } -AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb) +AP_DECLARE(apr_status_t) ap_get_mime_headers_ex(request_rec *r, + ap_filter_t *f, + apr_read_type_e block, + apr_bucket_brigade *bb, + ap_getline_state_t **state_p) { - char *last_field = NULL; - apr_size_t last_len = 0; - apr_size_t alloc_len = 0; - char *field; - char *value; - apr_size_t len; - int fields_read = 0; - char *tmp_field; + apr_status_t rv = APR_SUCCESS; core_server_config *conf = ap_get_core_module_config(r->server->module_config); int strict = (conf->http_conformance != AP_HTTP_CONFORMANCE_UNSAFE); + apr_size_t max_size = r->server->limit_req_fieldsize + 1; + int flags = AP_GETLINE_ALLOC | AP_GETLINE_FOLD_COL; + int fields_read = 0; + + if (strict) { + flags |= AP_GETLINE_CRLF; + } + if (block == APR_NONBLOCK_READ) { + flags |= AP_GETLINE_NONBLOCK; + } /* * Read header lines until we get the empty separator line, a read error, * the connection closes (EOF), reach the server limit, or we timeout. */ while(1) { - apr_status_t rv; - - field = NULL; - rv = ap_rgetline(&field, r->server->limit_req_fieldsize + 2, - &len, r, strict ? AP_GETLINE_CRLF : 0, bb); + char *field = NULL; + apr_size_t len = 0; + /* max_size + 2 for CRLF */ + rv = ap_fgetline_ex(&field, max_size + 2, &len, f, flags, bb, + state_p, r->pool); + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + goto cleanup; + } if (rv != APR_SUCCESS) { if (APR_STATUS_IS_TIMEUP(rv)) { r->status = HTTP_REQUEST_TIME_OUT; @@ -822,7 +907,7 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb r->status = HTTP_BAD_REQUEST; } - /* ap_rgetline returns APR_ENOSPC if it fills up the buffer before + /* ap_fgetline returns APR_ENOSPC if it fills up the buffer before * finding the end-of-line. This is only going to happen if it * exceeds the configured limit for a field size. 
*/ @@ -837,7 +922,12 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb (field) ? field_name_len(field) : 0, (field) ? field : ""); } - return; + goto cleanup; + } + + /* Found the terminating empty end-of-headers line, stop. */ + if (len == 0) { + break; } /* For all header values, and all obs-fold lines, the presence of @@ -849,82 +939,11 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb field[--len] = '\0'; } - if (*field == '\t' || *field == ' ') { - - /* Append any newly-read obs-fold line onto the preceding - * last_field line we are processing - */ - apr_size_t fold_len; - - if (last_field == NULL) { - r->status = HTTP_BAD_REQUEST; - ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03442) - "Line folding encountered before first" - " header line"); - return; - } - - if (field[1] == '\0') { - r->status = HTTP_BAD_REQUEST; - ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03443) - "Empty folded line encountered"); - return; - } - - /* Leading whitespace on an obs-fold line can be - * similarly discarded */ - while (field[1] == '\t' || field[1] == ' ') { - ++field; --len; - } - - /* This line is a continuation of the preceding line(s), - * so append it to the line that we've set aside. - * Note: this uses a power-of-two allocator to avoid - * doing O(n) allocs and using O(n^2) space for - * continuations that span many many lines. - */ - fold_len = last_len + len + 1; /* trailing null */ - - if (fold_len >= (apr_size_t)(r->server->limit_req_fieldsize)) { - r->status = HTTP_BAD_REQUEST; - /* report what we have accumulated so far before the - * overflow (last_field) as the field with the problem - */ - apr_table_setn(r->notes, "error-notes", - "Size of a request header field " - "exceeds server limit."); - ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(00562) - "Request header exceeds LimitRequestFieldSize " - "after folding: %.*s", - field_name_len(last_field), last_field); - return; - } - - if (fold_len > alloc_len) { - char *fold_buf; - alloc_len += alloc_len; - if (fold_len > alloc_len) { - alloc_len = fold_len; - } - fold_buf = (char *)apr_palloc(r->pool, alloc_len); - memcpy(fold_buf, last_field, last_len); - last_field = fold_buf; - } - memcpy(last_field + last_len, field, len +1); /* +1 for nul */ - /* Replace obs-fold w/ SP per RFC 7230 3.2.4 */ - last_field[last_len] = ' '; - last_len += len; - - /* We've appended this obs-fold line to last_len, proceed to - * read the next input line - */ - continue; - } - else if (last_field != NULL) { + { + char *value; - /* Process the previous last_field header line with all obs-folded - * segments already concatenated (this is not operating on the - * most recently read input line). + /* Process the header line with all obs-folded segments already + * concatenated. 
*/ if (r->server->limit_req_fields @@ -936,37 +955,40 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(00563) "Number of request headers exceeds " "LimitRequestFields"); - return; + rv = APR_ENOSPC; + goto cleanup; } - if (!strict) - { + if (!strict) { /* Not Strict ('Unsafe' mode), using the legacy parser */ - if (!(value = strchr(last_field, ':'))) { /* Find ':' or */ + if (!(value = strchr(field, ':'))) { /* Find ':' or */ r->status = HTTP_BAD_REQUEST; /* abort bad request */ ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(00564) "Request header field is missing ':' " "separator: %.*s", (int)LOG_NAME_MAX_LEN, - last_field); - return; + field); + rv = APR_EINVAL; + goto cleanup; } - if (value == last_field) { + if (value == field) { r->status = HTTP_BAD_REQUEST; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03453) "Request header field name was empty"); - return; + rv = APR_EINVAL; + goto cleanup; } *value++ = '\0'; /* NUL-terminate at colon */ - if (strpbrk(last_field, "\t\n\v\f\r ")) { + if (strpbrk(field, "\t\n\v\f\r ")) { r->status = HTTP_BAD_REQUEST; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03452) "Request header field name presented" " invalid whitespace"); - return; + rv = APR_EINVAL; + goto cleanup; } while (*value == ' ' || *value == '\t') { @@ -978,64 +1000,51 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03451) "Request header field value presented" " bad whitespace"); - return; + rv = APR_EINVAL; + goto cleanup; } } - else /* Using strict RFC7230 parsing */ - { + else { + /* Using strict RFC7230 parsing */ + /* Ensure valid token chars before ':' per RFC 7230 3.2.4 */ - value = (char *)ap_scan_http_token(last_field); - if ((value == last_field) || *value != ':') { + value = (char *)ap_scan_http_token(field); + if ((value == field) || *value != ':') { r->status = HTTP_BAD_REQUEST; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(02426) "Request header field name is malformed: " - "%.*s", (int)LOG_NAME_MAX_LEN, last_field); - return; + "%.*s", (int)LOG_NAME_MAX_LEN, field); + rv = APR_EINVAL; + goto cleanup; } - *value++ = '\0'; /* NUL-terminate last_field name at ':' */ + *value++ = '\0'; /* NUL-terminate field name at ':' */ while (*value == ' ' || *value == '\t') { ++value; /* Skip LWS of value */ } - /* Find invalid, non-HT ctrl char, or the trailing NULL */ - tmp_field = (char *)ap_scan_http_field_content(value); - /* Reject value for all garbage input (CTRLs excluding HT) * e.g. only VCHAR / SP / HT / obs-text are allowed per * RFC7230 3.2.6 - leave all more explicit rule enforcement * for specific header handler logic later in the cycle */ - if (*tmp_field != '\0') { + if (*ap_scan_http_field_content(value) != '\0') { r->status = HTTP_BAD_REQUEST; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(02427) "Request header value is malformed: " "%.*s", (int)LOG_NAME_MAX_LEN, value); - return; + rv = APR_EINVAL; + goto cleanup; } } - apr_table_addn(r->headers_in, last_field, value); + apr_table_addn(r->headers_in, field, value); - /* This last_field header is now stored in headers_in, + /* This field header is now stored in headers_in, * resume processing of the current input line. */ } - - /* Found the terminating empty end-of-headers line, stop. 
*/ - if (len == 0) { - break; - } - - /* Keep track of this new header line so that we can extend it across - * any obs-fold or parse it on the next loop iteration. We referenced - * our previously allocated buffer in r->headers_in, - * so allocate a fresh buffer if required. - */ - alloc_len = 0; - last_field = field; - last_len = len; } /* Combine multiple message-header fields with the same @@ -1045,14 +1054,25 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb /* enforce LimitRequestFieldSize for merged headers */ apr_table_do(table_do_fn_check_lengths, r, r->headers_in, NULL); + +cleanup: + apr_brigade_cleanup(bb); + return rv; +} + +AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb) +{ + ap_getline_state_t *state = NULL; + (void)ap_get_mime_headers_ex(r, r->proto_input_filters, APR_BLOCK_READ, + bb, &state); } AP_DECLARE(void) ap_get_mime_headers(request_rec *r) { - apr_bucket_brigade *tmp_bb; - tmp_bb = apr_brigade_create(r->pool, r->connection->bucket_alloc); + conn_rec *c = r->connection; + apr_bucket_brigade *tmp_bb = ap_acquire_brigade(c); ap_get_mime_headers_core(r, tmp_bb); - apr_brigade_destroy(tmp_bb); + ap_release_brigade(c, tmp_bb); } AP_DECLARE(request_rec *) ap_create_request(conn_rec *conn) @@ -1305,23 +1325,42 @@ AP_DECLARE(int) ap_assign_request_line(request_rec *r, AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) { + request_rec *r = NULL; + (void)ap_read_request_ex(&r, conn, APR_BLOCK_READ); + return r; +} + +AP_DECLARE(apr_status_t) ap_read_request_ex(request_rec **out_r, conn_rec *conn, + apr_read_type_e block) +{ + apr_status_t rv; int access_status; apr_bucket_brigade *tmp_bb; - apr_bucket *e, *bdata = NULL, *berr = NULL; + apr_bucket *e, *bdata = NULL; + ap_bucket_error *berr = NULL; ap_bucket_request *breq = NULL; const char *method, *uri, *protocol; apr_table_t *headers; - apr_status_t rv; - - request_rec *r = ap_create_request(conn); + request_rec *r; - tmp_bb = apr_brigade_create(r->pool, r->connection->bucket_alloc); - conn->keepalive = AP_CONN_UNKNOWN; + r = conn->partial_request; + if (conn->keepalive == AP_CONN_KEEPALIVE) { + conn->keepalive = AP_CONN_UNKNOWN; + } + if (!r) { + r = ap_create_request(conn); + ap_run_pre_read_request(r, conn); + r->request_time = apr_time_now(); + } - ap_run_pre_read_request(r, conn); + tmp_bb = ap_acquire_brigade(conn); - r->request_time = apr_time_now(); - rv = ap_get_brigade(r->proto_input_filters, tmp_bb, AP_MODE_READBYTES, APR_BLOCK_READ, 0); + rv = ap_get_brigade(r->proto_input_filters, tmp_bb, AP_MODE_READBYTES, block, 0); + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + conn->partial_request = r; + r = NULL; + goto done; + } if (rv != APR_SUCCESS || APR_BRIGADE_EMPTY(tmp_bb)) { /* Not worth dying with. 
*/ conn->keepalive = AP_CONN_CLOSE; @@ -1337,7 +1376,7 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) if (!breq) breq = e->data; } else if (AP_BUCKET_IS_ERROR(e)) { - if (!berr) berr = e; + if (!berr) berr = e->data; } else if (!APR_BUCKET_IS_METADATA(e) && e->length != 0) { if (!bdata) bdata = e; @@ -1345,16 +1384,11 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) } } - if (!breq && !berr) { - ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10389) - "request failed: neither request bucket nor error at start of input"); - access_status = HTTP_INTERNAL_SERVER_ERROR; - goto die_unusable_input; - } - + /* If there is a request, we always process it, as it defines + * the context in which a potential error bucket is handled. */ if (breq) { - /* If there is a request, we always process it, as it defines - * the context in which a potential error bucket is handled. */ + conn->partial_request = NULL; + if (apr_pool_is_ancestor(r->pool, breq->pool)) { method = breq->method; uri = breq->uri; @@ -1369,8 +1403,7 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) } if (!method || !uri || !protocol) { - access_status = berr? ((ap_bucket_error *)(berr->data))->status : - HTTP_INTERNAL_SERVER_ERROR; + access_status = berr ? berr->status : HTTP_INTERNAL_SERVER_ERROR; goto die_unusable_input; } @@ -1414,20 +1447,31 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) goto ignore; } } - if (berr) { - access_status = ((ap_bucket_error *)(berr->data))->status; + /* APLOG_ERR already raised by filters (eventually). */ + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(10467) + "request failed: error %i at start of input", + berr->status); + access_status = berr->status; goto die_unusable_input; } - else if (bdata) { + if (!breq) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10389) + "request failed: neither request bucket nor error " + "at start of input"); + access_status = HTTP_INTERNAL_SERVER_ERROR; + goto die_unusable_input; + } + if (bdata) { /* Since processing of a request body depends on knowing the request, we * cannot handle any data here. For example, chunked-encoding filters are * added after the request is read, so any data buckets here will not * have been de-chunked. */ ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10391) - "request failed: seeing DATA bucket(len=%d) of request " - "body, too early to process", (int)bdata->length); + "request failed: seeing DATA bucket (len=%" APR_SIZE_T_FMT ") " + "of request body, too early to process", + bdata->length); access_status = HTTP_INTERNAL_SERVER_ERROR; goto die_unusable_input; } @@ -1480,7 +1524,9 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) AP_READ_REQUEST_SUCCESS((uintptr_t)r, (char *)r->method, (char *)r->uri, (char *)r->server->defn_name, r->status); - return r; +done: + ap_release_brigade(conn, tmp_bb); + return (*out_r = r) ? APR_SUCCESS : APR_EAGAIN; /* Everything falls through on failure */ @@ -1523,9 +1569,10 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) } ignore: - r = NULL; + ap_release_brigade(conn, tmp_bb); + *out_r = conn->partial_request = r = NULL; AP_READ_REQUEST_FAILURE((uintptr_t)r); - return NULL; + return APR_EGENERAL; } AP_DECLARE(int) ap_post_read_request(request_rec *r) From 6eee3f3c338292f4782d71bd07094e90edfe18d7 Mon Sep 17 00:00:00 2001 From: ylavic Date: Thu, 11 Jul 2024 15:24:36 +0200 Subject: [PATCH 21/22] mod_proxy,mpm_event: Replace ap_mpm_register_poll_callback*() by ap_mpm_poll_suspended() to avoid races. 
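
With the old ap_mpm_register_poll_callback_timeout() API the MPM could
fire the IO callback (and thus hand the connection over to another
thread) or the timeout callback while the suspending thread was still
touching the connection, and the IO and timeout callbacks could race
with each other. With the new API the handler simply returns SUSPENDED,
the MPM arms the user pollfds from the suspend_connection hook (that
is, only once it fully owns the connection), and it then reports
readiness or timeout through the single resume_connection hook, so the
connection has exactly one owner at any time.

In outline, a module would typically follow the pattern sketched below
(illustration only: the example_* names and the baton fields are
placeholders, not part of this patch; see mod_proxy_http.c and
mod_proxy_wstunnel.c for the real implementations):

    /* Handler: pump I/O, then hand the connection to the MPM */
    static int example_handler(request_rec *r)
    {
        /* baton allocated from r->pool, holds pool/pfds/timeout */
        example_baton_t *b = example_make_baton(r);
        ap_set_module_config(r->request_config, &example_module, b);
        if (example_pump(b) == SUSPENDED) {
            b->suspended = 1;
            return SUSPENDED; /* don't poll from here, that'd race */
        }
        return example_done(b);
    }

    /* suspend_connection hook: the MPM owns the connection now,
     * so it is safe to (re)arm polling on the user pollfds */
    static void example_suspend_connection(conn_rec *c, request_rec *r)
    {
        example_baton_t *b = r ? ap_get_module_config(r->request_config,
                                                      &example_module)
                               : NULL;
        if (b && b->suspended) {
            ap_mpm_poll_suspended(c, b->pool, b->pfds, b->timeout);
        }
    }

    /* resume_connection hook: polling completed (or timed out) */
    static void example_resume_connection(conn_rec *c, request_rec *r)
    {
        example_baton_t *b = r ? ap_get_module_config(r->request_config,
                                                      &example_module)
                               : NULL;
        if (!b || !b->suspended) {
            return;
        }
        if (c->cs->state == CONN_STATE_SUSPENDED
                && example_pump(b) == SUSPENDED) {
            /* Still pending, keep polling in the MPM */
            ap_mpm_poll_suspended(c, b->pool, b->pfds, b->timeout);
        }
        else {
            /* Done or timed out, release the connection */
            c->cs->state = CONN_STATE_LINGER;
            ap_mpm_resume_suspended(c);
        }
    }

Modules can test for this ability with
ap_mpm_query(AP_MPMQ_CAN_POLL_SUSPENDED, &can), which only the event
MPM implements for now.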
--- include/ap_mmn.h | 8 +- include/ap_mpm.h | 51 +-- include/httpd.h | 2 +- include/mpm_common.h | 18 +- modules/http/http_core.c | 20 +- modules/proxy/mod_proxy_http.c | 266 ++++++++++------ modules/proxy/mod_proxy_wstunnel.c | 220 +++++++------ modules/proxy/proxy_util.c | 2 +- server/mpm/event/event.c | 487 ++++++++++++++++------------- server/mpm_common.c | 39 +-- server/mpm_fdqueue.h | 1 - 11 files changed, 605 insertions(+), 509 deletions(-) diff --git a/include/ap_mmn.h b/include/ap_mmn.h index fb8f4512d47..aac4e1a3401 100644 --- a/include/ap_mmn.h +++ b/include/ap_mmn.h @@ -735,14 +735,18 @@ * ap_check_output_pending() * 20211221.27 (2.5.1-dev) Add min_connection_timeout hook and * ap_get_connection_timeout() + * 20211221.28 (2.5.1-dev) Add ap_mpm_poll_suspended() and + * AP_MPMQ_CAN_POLL_SUSPENDED + * 20240701.0 (2.5.1-dev) Axe ap_mpm_register_poll_callback and + * ap_mpm_register_poll_callback_timeout */ #define MODULE_MAGIC_COOKIE 0x41503235UL /* "AP25" */ #ifndef MODULE_MAGIC_NUMBER_MAJOR -#define MODULE_MAGIC_NUMBER_MAJOR 20211221 +#define MODULE_MAGIC_NUMBER_MAJOR 20240701 #endif -#define MODULE_MAGIC_NUMBER_MINOR 27 /* 0...n */ +#define MODULE_MAGIC_NUMBER_MINOR 0 /* 0...n */ /** * Determine if the server's current MODULE_MAGIC_NUMBER is at least a diff --git a/include/ap_mpm.h b/include/ap_mpm.h index f2fd436d508..9a7ec6eeaa3 100644 --- a/include/ap_mpm.h +++ b/include/ap_mpm.h @@ -184,6 +184,8 @@ AP_DECLARE(apr_status_t) ap_os_create_privileged_process( #define AP_MPMQ_CAN_POLL 18 /** MPM supports CONN_STATE_ASYNC_WAITIO */ #define AP_MPMQ_CAN_WAITIO 19 +/** MPM implements the poll_suspended hook */ +#define AP_MPMQ_CAN_POLL_SUSPENDED 20 /** @} */ /** @@ -206,54 +208,13 @@ typedef void (ap_mpm_callback_fn_t)(void *baton); /* only added support in the Event MPM.... check for APR_ENOTIMPL */ AP_DECLARE(apr_status_t) ap_mpm_resume_suspended(conn_rec *c); /* only added support in the Event MPM.... check for APR_ENOTIMPL */ +AP_DECLARE(apr_status_t) ap_mpm_poll_suspended(conn_rec *c, apr_pool_t *p, + const apr_array_header_t *pfds, + apr_interval_time_t timeout); +/* only added support in the Event MPM.... check for APR_ENOTIMPL */ AP_DECLARE(apr_status_t) ap_mpm_register_timed_callback( apr_time_t t, ap_mpm_callback_fn_t *cbfn, void *baton); -/** - * Register a callback on the readability or writability on a group of - * sockets/pipes. - * @param p Pool used by the MPM for its internal allocations - * @param pfds Array of apr_pollfd_t - * @param cbfn The callback function - * @param baton userdata for the callback function - * @return APR_SUCCESS if all sockets/pipes could be added to a pollset, - * APR_ENOTIMPL if no asynch support, or an apr_pollset_add error. - * @remark When activity is found on any 1 socket/pipe in the list, all are removed - * from the pollset and only 1 callback is issued. - * @remark The passed in pool can be cleared by cbfn and tofn when called back, - * it retains no MPM persistent data and won't be used until the next call - * to ap_mpm_register_poll_callback[_timeout]. - */ - -AP_DECLARE(apr_status_t) ap_mpm_register_poll_callback( - apr_pool_t *p, const apr_array_header_t *pfds, - ap_mpm_callback_fn_t *cbfn, void *baton); - -/** - * Register a callback on the readability or writability on a group of sockets/pipes, - * with a timeout. 
- * @param p Pool used by the MPM for its internal allocations - * @param pfds Array of apr_pollfd_t - * @param cbfn The callback function - * @param tofn The callback function if the timeout expires - * @param baton userdata for the callback function - * @param timeout timeout for I/O in microseconds, unlimited if <= 0 - * @return APR_SUCCESS if all sockets/pipes could be added to a pollset, - * APR_ENOTIMPL if no asynch support, or an apr_pollset_add error. - * @remark When activity is found on any 1 socket/pipe in the list, all are removed - * from the pollset and only 1 callback is issued. - * @remark For each call, only one of tofn or cbfn will be called, never both. - * @remark The passed in pool can be cleared by cbfn and tofn when called back, - * it retains no MPM persistent data and won't be used until the next call - * to ap_mpm_register_poll_callback[_timeout]. - */ - -AP_DECLARE(apr_status_t) ap_mpm_register_poll_callback_timeout( - apr_pool_t *p, const apr_array_header_t *pfds, - ap_mpm_callback_fn_t *cbfn, ap_mpm_callback_fn_t *tofn, - void *baton, apr_time_t timeout); - - typedef enum mpm_child_status { MPM_CHILD_STARTED, MPM_CHILD_EXITED, diff --git a/include/httpd.h b/include/httpd.h index ae08740b227..931f5fff49a 100644 --- a/include/httpd.h +++ b/include/httpd.h @@ -1334,7 +1334,7 @@ typedef enum { CONN_STATE_PROCESSING, /* Processed by process_connection hooks */ CONN_STATE_HANDLER, /* Processed by the modules handlers */ CONN_STATE_WRITE_COMPLETION, /* Flushed by the MPM before entering CONN_STATE_KEEPALIVE */ - CONN_STATE_SUSPENDED, /* Suspended in the MPM until ap_run_resume_suspended() */ + CONN_STATE_SUSPENDED, /* Suspended from the MPM until ap_run_resume_suspended() */ CONN_STATE_LINGER, /* MPM flushes then closes the connection with lingering */ CONN_STATE_LINGER_NORMAL, /* MPM has started lingering close with normal timeout */ CONN_STATE_LINGER_SHORT, /* MPM has started lingering close with short timeout */ diff --git a/include/mpm_common.h b/include/mpm_common.h index 34c61e2a6c2..43320b2b5c9 100644 --- a/include/mpm_common.h +++ b/include/mpm_common.h @@ -422,22 +422,12 @@ AP_DECLARE_HOOK(int, mpm_query, (int query_code, int *result, apr_status_t *rv)) AP_DECLARE_HOOK(apr_status_t, mpm_register_timed_callback, (apr_time_t t, ap_mpm_callback_fn_t *cbfn, void *baton)) -/** - * register the specified callback - * @ingroup hooks - */ -AP_DECLARE_HOOK(apr_status_t, mpm_register_poll_callback, - (apr_pool_t *p, const apr_array_header_t *pds, - ap_mpm_callback_fn_t *cbfn, void *baton)) - -/* register the specified callback, with timeout +/** Put suspended connection's pollfds into the MPM's pollset * @ingroup hooks - * */ -AP_DECLARE_HOOK(apr_status_t, mpm_register_poll_callback_timeout, - (apr_pool_t *p, const apr_array_header_t *pds, - ap_mpm_callback_fn_t *cbfn, ap_mpm_callback_fn_t *tofn, - void *baton, apr_time_t timeout)) +AP_DECLARE_HOOK(apr_status_t, mpm_poll_suspended, + (conn_rec *c, apr_pool_t *p, const apr_array_header_t *pfds, + apr_interval_time_t timeout)) /** Resume the suspended connection * @ingroup hooks diff --git a/modules/http/http_core.c b/modules/http/http_core.c index 7e9f82f87dd..92a472d3fa7 100644 --- a/modules/http/http_core.c +++ b/modules/http/http_core.c @@ -182,20 +182,22 @@ static int ap_process_http_async_connection(conn_rec *c) * of nondeterministic failures later. 
*/ r = NULL; - } - if (cs->state != CONN_STATE_WRITE_COMPLETION && - cs->state != CONN_STATE_SUSPENDED && - cs->state != CONN_STATE_LINGER) { - /* Something went wrong; close the connection */ - cs->state = CONN_STATE_LINGER; + switch (cs->state) { + case CONN_STATE_WRITE_COMPLETION: + case CONN_STATE_SUSPENDED: + case CONN_STATE_LINGER: + return OK; + default: + /* Unexpected, close */ + break; + } } } - else { /* ap_read_request failed - client may have closed */ - cs->state = CONN_STATE_LINGER; - } } + /* Something went wrong; close the connection */ + cs->state = CONN_STATE_LINGER; return OK; } diff --git a/modules/proxy/mod_proxy_http.c b/modules/proxy/mod_proxy_http.c index 38da5b0f7f6..66a66af7949 100644 --- a/modules/proxy/mod_proxy_http.c +++ b/modules/proxy/mod_proxy_http.c @@ -19,9 +19,12 @@ #include "mod_proxy.h" #include "ap_regex.h" #include "ap_mpm.h" +#include "mpm_common.h" module AP_MODULE_DECLARE_DATA proxy_http_module; +static int mpm_can_poll_suspended = 0; + static int (*ap_proxy_clear_connection_fn)(request_rec *r, apr_table_t *headers) = NULL; @@ -275,12 +278,6 @@ static void add_cl(apr_pool_t *p, #define MAX_MEM_SPOOL 16384 -typedef enum { - PROXY_HTTP_REQ_HAVE_HEADER = 0, - - PROXY_HTTP_TUNNELING -} proxy_http_state; - typedef enum { RB_INIT = 0, RB_STREAM_CL, @@ -307,7 +304,6 @@ typedef struct { char *old_cl_val, *old_te_val; apr_off_t cl_val; - proxy_http_state state; rb_methods rb_method; const char *upgrade; @@ -316,108 +312,148 @@ typedef struct { apr_pool_t *async_pool; apr_interval_time_t idle_timeout; - unsigned int can_go_async :1, + unsigned int can_suspend :1, do_100_continue :1, prefetch_nonblocking :1, - force10 :1; + force10 :1, + suspended :1, + upgraded :1; } proxy_http_req_t; -static void proxy_http_async_finish(proxy_http_req_t *req) +static int proxy_http_tunnel_pump(proxy_http_req_t *req) +{ + int status = ap_proxy_tunnel_run(req->tunnel); + if (status == HTTP_GATEWAY_TIME_OUT) { + if (!req->can_suspend) { + /* ap_proxy_tunnel_run() didn't log this */ + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, req->r, APLOGNO() + "proxy: %s tunneling timed out", + req->proto); + } + else { + status = SUSPENDED; + } + } + return status; +} + +/* The backend and SUSPENDED client connections are done, + * release them (the latter in the MPM). + */ +static void proxy_http_async_done(proxy_http_req_t *req, int cancelled) { - conn_rec *c = req->r->connection; + request_rec *r = req->r; + conn_rec *c = r->connection; + proxy_conn_rec *backend = req->backend; + proxy_tunnel_rec *tunnel = req->tunnel; + int reusable = (!cancelled && !req->upgraded); + + ap_log_rerror(APLOG_MARK, APLOG_TRACE5, 0, r, "proxy %s: %s async", + req->proto, cancelled ? 
"cancel" : "finish"); + + if (req->async_pool) { + apr_pool_destroy(req->async_pool); + req->async_pool = NULL; + } - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, req->r, - "proxy %s: finish async", req->proto); + if (!reusable) { + c->keepalive = AP_CONN_CLOSE; + backend->close = 1; + } /* Report bytes exchanged by the backend */ - req->backend->worker->s->read += - ap_proxy_tunnel_conn_bytes_in(req->tunnel->origin); - req->backend->worker->s->transferred += - ap_proxy_tunnel_conn_bytes_out(req->tunnel->origin); + backend->worker->s->read += + ap_proxy_tunnel_conn_bytes_in(tunnel->origin); + backend->worker->s->transferred += + ap_proxy_tunnel_conn_bytes_out(tunnel->origin); - proxy_run_detach_backend(req->r, req->backend); - ap_proxy_release_connection(req->proto, req->backend, req->r->server); + proxy_run_detach_backend(r, backend); + ap_proxy_release_connection(req->proto, backend, r->server); - ap_finalize_request_protocol(req->r); - ap_process_request_after_handler(req->r); - /* don't touch req or req->r from here */ + ap_finalize_request_protocol(r); + ap_process_request_after_handler(r); + /* don't dereference req or r from here! */ - c->cs->state = CONN_STATE_LINGER; + /* Return the client connection to the MPM */ + if (reusable) { + c->cs->state = CONN_STATE_WRITE_COMPLETION; + } + else { + c->cs->state = CONN_STATE_LINGER; + } ap_mpm_resume_suspended(c); } -/* If neither socket becomes readable in the specified timeout, - * this callback will kill the request. - * We do not have to worry about having a cancel and a IO both queued. - */ -static void proxy_http_async_cancel_cb(void *baton) -{ - proxy_http_req_t *req = (proxy_http_req_t *)baton; - - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, req->r, - "proxy %s: cancel async", req->proto); - - req->r->connection->keepalive = AP_CONN_CLOSE; - req->backend->close = 1; - proxy_http_async_finish(req); -} +/* Tell the MPM to poll the connections and resume when ready */ +static void proxy_http_async_poll(proxy_http_req_t *req) +{ + conn_rec *c = req->r->connection; + proxy_tunnel_rec *tunnel = req->tunnel; -/* Invoked by the event loop when data is ready on either end. - * We don't need the invoke_mtx, since we never put multiple callback events - * in the queue. - */ -static void proxy_http_async_cb(void *baton) -{ - proxy_http_req_t *req = (proxy_http_req_t *)baton; - int status; + ap_log_rerror(APLOG_MARK, APLOG_TRACE5, 0, req->r, + "proxy %s: going async", req->proto); + /* Create/clear the subpool used by the MPM to allocate + * the temporary data needed for this polling. 
+     */
     if (req->async_pool) {
-        /* Clear MPM's temporary data */
         apr_pool_clear(req->async_pool);
     }
+    else {
+        apr_pool_create(&req->async_pool, req->p);
+    }
 
-    switch (req->state) {
-    case PROXY_HTTP_TUNNELING:
-        /* Pump both ends until they'd block and then start over again */
-        status = ap_proxy_tunnel_run(req->tunnel);
-        if (status == HTTP_GATEWAY_TIME_OUT) {
-            status = SUSPENDED;
-        }
-        break;
+    ap_mpm_poll_suspended(c, req->async_pool, tunnel->pfds, req->idle_timeout);
+}
 
-    default:
-        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, req->r,
-                      "proxy %s: unexpected async state (%i)",
-                      req->proto, (int)req->state);
-        status = HTTP_INTERNAL_SERVER_ERROR;
-        break;
-    }
+/* The resume_connection hook called by the MPM when async polling completes (or times out) */
+static void proxy_http_resume_connection(conn_rec *c, request_rec *r)
+{
+    proxy_http_req_t *req = NULL;
+    int status;
 
-    if (status == SUSPENDED) {
-        ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, req->r,
-                      "proxy %s: suspended, going async",
-                      req->proto);
-
-        if (!req->async_pool) {
-            /* Create the subpool used by the MPM to alloc its own
-             * temporary data, which we want to clear on the next
-             * round (above) to avoid leaks.
-             */
-            apr_pool_create(&req->async_pool, req->p);
-        }
+    if (r) {
+        req = ap_get_module_config(r->request_config, &proxy_http_module);
+    }
+    if (!req || !req->suspended) {
+        return;
+    }
+    ap_assert(req->r == r);
 
-        ap_mpm_register_poll_callback_timeout(req->async_pool,
-                                              req->tunnel->pfds,
-                                              proxy_http_async_cb,
-                                              proxy_http_async_cancel_cb,
-                                              req, req->idle_timeout);
+    if (c->cs->state == CONN_STATE_SUSPENDED) {
+        status = proxy_http_tunnel_pump(req);
+    }
+    else {
+        AP_DEBUG_ASSERT(c->cs->state == CONN_STATE_LINGER);
+        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO()
+                      "proxy: %s async tunneling timed out (state %i)",
+                      req->proto, c->cs->state);
+        status = DONE;
     }
-    else if (ap_is_HTTP_ERROR(status)) {
-        proxy_http_async_cancel_cb(req);
+    if (status == SUSPENDED) {
+        /* Keep polling in the MPM */
+        proxy_http_async_poll(req);
     }
     else {
-        proxy_http_async_finish(req);
+        /* Done with tunneling */
+        proxy_http_async_done(req, status != OK);
+    }
+}
+
+/* The suspend_connection hook called once the MPM gets the SUSPENDED connection */
+static void proxy_http_suspend_connection(conn_rec *c, request_rec *r)
+{
+    proxy_http_req_t *req = NULL;
+
+    if (r) {
+        req = ap_get_module_config(r->request_config, &proxy_http_module);
+    }
+    if (!req || !req->suspended) {
+        return;
+    }
+    ap_assert(req->r == r);
+
+    proxy_http_async_poll(req);
 }
 
 static int stream_reqbody(proxy_http_req_t *req)
@@ -1553,22 +1589,40 @@ int ap_proxy_http_process_response(proxy_http_req_t *req)
                           "can't create tunnel for %s", upgrade);
             return HTTP_INTERNAL_SERVER_ERROR;
         }
+        if (req->can_suspend) {
+            /* If the MPM allows async polling, this thread will tunnel
+             * all it can now so long as it does not time out on the (short)
+             * async delay, returning to the MPM otherwise to get scheduled
+             * again when the connections are ready.
+             */
+            req->tunnel->timeout = dconf->async_delay;
+        }
+        else {
+            /* If the MPM doesn't allow async polling, the full tunneling
+             * happens now in this thread and timing out is a showstopper.
+             */
+            req->tunnel->timeout = req->idle_timeout;
+        }
 
         r->status = HTTP_SWITCHING_PROTOCOLS;
         req->proto = upgrade;
-
-        if (req->can_go_async) {
-            /* Let the MPM schedule the work when idle */
-            req->state = PROXY_HTTP_TUNNELING;
-            req->tunnel->timeout = dconf->async_delay;
-            proxy_http_async_cb(req);
+        req->upgraded = 1;
+
+        status = proxy_http_tunnel_pump(req);
+        if (status == SUSPENDED) {
+            /* Let the MPM call proxy_http_suspend_connection() when
+             * the connection is returned to it (i.e. not handled anywhere
+             * else anymore). This prevents the connection from being seen
+             * or handled by multiple threads at the same time, which could
+             * happen if we called ap_mpm_poll_suspended() directly from
+             * here: a new IO could cause the connection to be rescheduled
+             * before it actually reaches the MPM.
+             */
+            req->suspended = 1;
             return SUSPENDED;
        }
 
-        /* Let proxy tunnel forward everything within this thread */
-        req->tunnel->timeout = req->idle_timeout;
-        status = ap_proxy_tunnel_run(req->tunnel);
-
         /* Report bytes exchanged by the backend */
         backend->worker->s->read +=
             ap_proxy_tunnel_conn_bytes_in(req->tunnel->origin);
@@ -1932,7 +1986,6 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
     proxy_http_req_t *req = NULL;
     proxy_conn_rec *backend = NULL;
     apr_bucket_brigade *input_brigade = NULL;
-    int mpm_can_poll = 0;
     int is_ssl = 0;
     conn_rec *c = r->connection;
     proxy_dir_conf *dconf;
@@ -1972,7 +2025,6 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
     backend->is_ssl = is_ssl;
 
     dconf = ap_get_module_config(r->per_dir_config, &proxy_module);
-    ap_mpm_query(AP_MPMQ_CAN_POLL, &mpm_can_poll);
 
     req = apr_pcalloc(p, sizeof(*req));
     req->p = p;
@@ -1983,12 +2035,13 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
     req->backend = backend;
     req->proto = scheme;
     req->bucket_alloc = c->bucket_alloc;
-    req->can_go_async = (mpm_can_poll &&
-                         dconf->async_delay_set &&
-                         dconf->async_delay >= 0);
-    req->state = PROXY_HTTP_REQ_HAVE_HEADER;
+    req->can_suspend = (mpm_can_poll_suspended &&
+                        dconf->async_delay_set &&
+                        dconf->async_delay >= 0);
     req->rb_method = RB_INIT;
 
+    ap_set_module_config(r->request_config, &proxy_http_module, req);
+
     if (apr_table_get(r->subprocess_env, "force-proxy-request-1.0")) {
         req->force10 = 1;
     }
@@ -2004,9 +2057,9 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
         }
     }
 
-    if (req->can_go_async || req->upgrade) {
+    if (req->can_suspend || req->upgrade) {
         /* If ProxyAsyncIdleTimeout is not set, use backend timeout */
-        if (req->can_go_async && dconf->async_idle_timeout_set) {
+        if (req->can_suspend && dconf->async_idle_timeout_set) {
             req->idle_timeout = dconf->async_idle_timeout;
         }
         else if (worker->s->timeout_set) {
@@ -2045,7 +2098,7 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
      * data to the backend ASAP?
      */
     if (input_brigade
-        || req->can_go_async
+        || req->can_suspend
         || req->do_100_continue
         || apr_table_get(r->subprocess_env,
                          "proxy-prefetch-nonblocking")) {
@@ -2190,13 +2243,18 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
 static int proxy_http_post_config(apr_pool_t *pconf, apr_pool_t *plog,
                                   apr_pool_t *ptemp, server_rec *s)
 {
-
     /* proxy_http_post_config() will be called twice during startup.  So, don't
      * set up the static data the 1st time through.
*/ if (ap_state_query(AP_SQ_MAIN_STATE) == AP_SQ_MS_CREATE_PRE_CONFIG) { return OK; } +#ifdef AP_MPMQ_CAN_POLL_SUSPENDED + if (ap_mpm_query(AP_MPMQ_CAN_POLL_SUSPENDED, &mpm_can_poll_suspended)) { + mpm_can_poll_suspended = 0; + } +#endif + ap_proxy_clear_connection_fn = APR_RETRIEVE_OPTIONAL_FN(ap_proxy_clear_connection); if (!ap_proxy_clear_connection_fn) { @@ -2214,6 +2272,10 @@ static void ap_proxy_http_register_hook(apr_pool_t *p) proxy_hook_scheme_handler(proxy_http_handler, NULL, NULL, APR_HOOK_FIRST); proxy_hook_canon_handler(proxy_http_canon, NULL, NULL, APR_HOOK_FIRST); warn_rx = ap_pregcomp(p, "[0-9]{3}[ \t]+[^ \t]+[ \t]+\"[^\"]*\"([ \t]+\"([^\"]+)\")?", 0); + + /* For when the tunnel connections are suspended to and resumed from the MPM */ + ap_hook_suspend_connection(proxy_http_suspend_connection, NULL, NULL, APR_HOOK_FIRST); + ap_hook_resume_connection(proxy_http_resume_connection, NULL, NULL, APR_HOOK_FIRST); } AP_DECLARE_MODULE(proxy_http) = { diff --git a/modules/proxy/mod_proxy_wstunnel.c b/modules/proxy/mod_proxy_wstunnel.c index 0e5e6cb8128..3439b08b18d 100644 --- a/modules/proxy/mod_proxy_wstunnel.c +++ b/modules/proxy/mod_proxy_wstunnel.c @@ -17,13 +17,15 @@ #include "mod_proxy.h" #include "http_config.h" #include "ap_mpm.h" +#include "mpm_common.h" module AP_MODULE_DECLARE_DATA proxy_wstunnel_module; +static int mpm_can_poll_suspended = 0; + typedef struct { unsigned int fallback_to_proxy_http :1, fallback_to_proxy_http_set :1; - int mpm_can_poll; apr_time_t idle_timeout; apr_time_t async_delay; } proxyws_dir_conf; @@ -32,83 +34,130 @@ typedef struct ws_baton_t { request_rec *r; proxy_conn_rec *backend; proxy_tunnel_rec *tunnel; + apr_time_t idle_timeout; apr_pool_t *async_pool; const char *scheme; + int suspended; } ws_baton_t; static int can_fallback_to_proxy_http; -static void proxy_wstunnel_callback(void *b); - -static int proxy_wstunnel_pump(ws_baton_t *baton, int async) +static int proxy_wstunnel_pump(ws_baton_t *baton) { int status = ap_proxy_tunnel_run(baton->tunnel); if (status == HTTP_GATEWAY_TIME_OUT) { - if (!async) { + if (!mpm_can_poll_suspended) { /* ap_proxy_tunnel_run() didn't log this */ ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, baton->r, APLOGNO(10225) - "Tunnel timed out"); + "proxy: %s tunneling timed out", + baton->scheme); } else { - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, baton->r, APLOGNO(02542) - "Attempting to go async"); status = SUSPENDED; } } return status; } -static void proxy_wstunnel_finish(ws_baton_t *baton) -{ - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, baton->r, "proxy_wstunnel_finish"); - ap_proxy_release_connection(baton->scheme, baton->backend, baton->r->server); - ap_finalize_request_protocol(baton->r); - ap_lingering_close(baton->r->connection); - ap_mpm_resume_suspended(baton->r->connection); - ap_process_request_after_handler(baton->r); /* don't touch baton or r after here */ +/* The backend and SUSPENDED client connections are done, + * release them (the latter in the MPM). + */ +static void proxy_wstunnel_done(ws_baton_t *baton, int cancelled) +{ + request_rec *r = baton->r; + conn_rec *c = r->connection; + proxy_conn_rec *backend = baton->backend; + + ap_log_rerror(APLOG_MARK, APLOG_TRACE5, 0, r, "proxy %s: %s async", + baton->scheme, cancelled ? 
"cancel" : "finish"); + + /* Upgraded connections not reusable */ + c->keepalive = AP_CONN_CLOSE; + backend->close = 1; + + ap_proxy_release_connection(baton->scheme, backend, r->server); + + ap_finalize_request_protocol(r); + ap_process_request_after_handler(r); + /* don't dereference baton or r from here! */ + + /* Return the client connection to the MPM */ + c->cs->state = CONN_STATE_LINGER; + ap_mpm_resume_suspended(c); } -/* If neither socket becomes readable in the specified timeout, - * this callback will kill the request. We do not have to worry about - * having a cancel and a IO both queued. - */ -static void proxy_wstunnel_cancel_callback(void *b) -{ - ws_baton_t *baton = (ws_baton_t*)b; - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, baton->r, - "proxy_wstunnel_cancel_callback, IO timed out"); - proxy_wstunnel_finish(baton); +/* Tell the MPM to poll the connections and resume when ready */ +static void proxy_wstunnel_poll(ws_baton_t *baton) +{ + request_rec *r = baton->r; + conn_rec *c = r->connection; + + ap_log_rerror(APLOG_MARK, APLOG_TRACE5, 0, r, + "proxy %s: going async", baton->scheme); + + /* Create/clear the subpool used by the MPM to allocate + * the temporary data needed for this polling. + */ + if (baton->async_pool) { + apr_pool_clear(baton->async_pool); + } + else { + apr_pool_create(&baton->async_pool, r->pool); + } + + c->cs->state = CONN_STATE_SUSPENDED; + ap_mpm_poll_suspended(c, baton->async_pool, baton->tunnel->pfds, + baton->idle_timeout); } -/* Invoked by the event loop when data is ready on either end. - * Pump both ends until they'd block and then start over again - * We don't need the invoke_mtx, since we never put multiple callback events - * in the queue. - */ -static void proxy_wstunnel_callback(void *b) -{ - ws_baton_t *baton = (ws_baton_t*)b; +/* The resume_connection hook called by the MPM when polling completes (or times out) */ +static void proxy_wstunnel_resume_connection(conn_rec *c, request_rec *r) +{ + ws_baton_t *baton = NULL; + int status; - /* Clear MPM's temporary data */ - AP_DEBUG_ASSERT(baton->async_pool != NULL); - apr_pool_clear(baton->async_pool); + if (r) { + baton = ap_get_module_config(r->request_config, &proxy_wstunnel_module); + } + if (!baton || !baton->suspended) { + return; + } + ap_assert(baton->r == r); - if (proxy_wstunnel_pump(baton, 1) == SUSPENDED) { - proxyws_dir_conf *dconf = ap_get_module_config(baton->r->per_dir_config, - &proxy_wstunnel_module); + if (c->cs->state == CONN_STATE_SUSPENDED) { + status = proxy_wstunnel_pump(baton); + } + else { + AP_DEBUG_ASSERT(c->cs->state == CONN_STATE_LINGER); + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO() + "proxy: %s async tunneling timed out (state %i)", + baton->scheme, c->cs->state); + status = DONE; + } + if (status == SUSPENDED) { + /* Keep polling in the MPM */ + proxy_wstunnel_poll(baton); + } + else { + /* Done with tunneling */ + proxy_wstunnel_done(baton, status != OK); + } +} - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, baton->r, - "proxy_wstunnel_callback suspend"); +/* The suspend_connection hook called once the MPM gets the SUSPENDED connection */ +static void proxy_wstunnel_suspend_connection(conn_rec *c, request_rec *r) +{ + ws_baton_t *baton = NULL; - ap_mpm_register_poll_callback_timeout(baton->async_pool, - baton->tunnel->pfds, - proxy_wstunnel_callback, - proxy_wstunnel_cancel_callback, - baton, dconf->idle_timeout); + if (r) { + baton = ap_get_module_config(r->request_config, &proxy_wstunnel_module); } - else { - proxy_wstunnel_finish(baton); + if 
(!baton || !baton->suspended) { + return; } + ap_assert(baton->r == r); + + proxy_wstunnel_poll(baton); } static int proxy_wstunnel_check_trans(request_rec *r, const char *url) @@ -296,51 +345,35 @@ static int proxy_wstunnel_request(apr_pool_t *p, request_rec *r, "error creating websocket tunnel"); return HTTP_INTERNAL_SERVER_ERROR; } + if (mpm_can_poll_suspended) { + tunnel->timeout = dconf->async_delay; + } + else { + tunnel->timeout = dconf->idle_timeout; + } baton = apr_pcalloc(r->pool, sizeof(*baton)); baton->r = r; baton->backend = conn; baton->tunnel = tunnel; baton->scheme = scheme; - - if (!dconf->mpm_can_poll) { - tunnel->timeout = dconf->idle_timeout; - status = proxy_wstunnel_pump(baton, 0); - } - else { - tunnel->timeout = dconf->async_delay; - status = proxy_wstunnel_pump(baton, 1); - if (status == SUSPENDED) { - /* Create the subpool used by the MPM to alloc its own - * temporary data, which we want to clear on the next - * round (above) to avoid leaks. - */ - apr_pool_create(&baton->async_pool, baton->r->pool); - - rv = ap_mpm_register_poll_callback_timeout( - baton->async_pool, - baton->tunnel->pfds, - proxy_wstunnel_callback, - proxy_wstunnel_cancel_callback, - baton, - dconf->idle_timeout); - if (rv == APR_SUCCESS) { - return SUSPENDED; - } - - if (APR_STATUS_IS_ENOTIMPL(rv)) { - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r, APLOGNO(02544) "No async support"); - tunnel->timeout = dconf->idle_timeout; - status = proxy_wstunnel_pump(baton, 0); /* force no async */ - } - else { - ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r, APLOGNO(10211) - "error registering websocket tunnel"); - status = HTTP_INTERNAL_SERVER_ERROR; - } - } + baton->idle_timeout = dconf->idle_timeout; + ap_set_module_config(r->request_config, &proxy_wstunnel_module, baton); + + status = proxy_wstunnel_pump(baton); + if (status == SUSPENDED) { + /* Let the MPM call proxy_wstunnel_suspend_connection() when + * the connection is returned to it (i.e. not handled anywhere + * else anymore). This prevents the connection from being seen + * or handled by multiple threads at the same time, which could + * happen if we'd call ap_mpm_poll_suspended() directly from + * here, during the time for the connection to actually reaches + * the MPM whilst a new IO causes the connection to be + * rescheduled quickly. 
+         */
+        baton->suspended = 1;
+        return SUSPENDED;
     }
-
     if (ap_is_HTTP_ERROR(status)) {
         /* Don't send an error page down an upgraded connection */
         if (!tunnel->replied) {
@@ -462,8 +495,6 @@ static void *create_proxyws_dir_config(apr_pool_t *p, char *dummy)
     new->fallback_to_proxy_http = 1;
     new->idle_timeout = -1; /* no timeout */
 
-    ap_mpm_query(AP_MPMQ_CAN_POLL, &new->mpm_can_poll);
-
     return (void *) new;
 }
 
@@ -477,7 +508,6 @@ static void *merge_proxyws_dir_config(apr_pool_t *p, void *vbase, void *vadd)
                                     : base->fallback_to_proxy_http;
     new->fallback_to_proxy_http_set = (add->fallback_to_proxy_http_set
                                        || base->fallback_to_proxy_http_set);
-    new->mpm_can_poll = add->mpm_can_poll;
     new->idle_timeout = add->idle_timeout;
     new->async_delay = add->async_delay;
 
@@ -514,6 +544,12 @@ static int proxy_wstunnel_post_config(apr_pool_t *pconf, apr_pool_t *plog,
     can_fallback_to_proxy_http
         = (ap_find_linked_module("mod_proxy_http.c") != NULL);
 
+#ifdef AP_MPMQ_CAN_POLL_SUSPENDED
+    if (ap_mpm_query(AP_MPMQ_CAN_POLL_SUSPENDED, &mpm_can_poll_suspended)) {
+        mpm_can_poll_suspended = 0;
+    }
+#endif
+
     return OK;
 }
 
@@ -542,6 +578,10 @@ static void ws_proxy_hooks(apr_pool_t *p)
     proxy_hook_scheme_handler(proxy_wstunnel_handler, NULL, aszSucc, APR_HOOK_FIRST);
     proxy_hook_check_trans(proxy_wstunnel_check_trans, NULL, aszSucc, APR_HOOK_MIDDLE);
     proxy_hook_canon_handler(proxy_wstunnel_canon, NULL, aszSucc, APR_HOOK_FIRST);
+
+    /* For when tunnel connections are suspended into and resumed from the MPM */
+    ap_hook_suspend_connection(proxy_wstunnel_suspend_connection, NULL, NULL, APR_HOOK_FIRST);
+    ap_hook_resume_connection(proxy_wstunnel_resume_connection, NULL, NULL, APR_HOOK_FIRST);
 }
 
 AP_DECLARE_MODULE(proxy_wstunnel) = {
diff --git a/modules/proxy/proxy_util.c b/modules/proxy/proxy_util.c
index 88d174220d8..52595a03ec5 100644
--- a/modules/proxy/proxy_util.c
+++ b/modules/proxy/proxy_util.c
@@ -5898,7 +5898,7 @@ PROXY_DECLARE(int) ap_proxy_tunnel_run(proxy_tunnel_rec *tunnel)
                 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10221)
                               "proxy: %s: %s flushing failed (%i)",
                               scheme, out->name, rc);
-                status = rc;
+                status = HTTP_BAD_GATEWAY;
                 goto done;
             }
 
diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index e0ba249bbf7..1a71f214c8c 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -246,6 +246,8 @@ typedef struct event_srv_cfg_s event_srv_cfg;
 struct timeout_queue;
 static apr_thread_mutex_t *timeout_mutex;
 
+struct user_poll_baton;
+
 /*
  * The pollset for sockets that are in any of the timeout queues. Currently
  * we use the timeout_mutex to make sure that connections are added/removed
@@ -297,6 +299,8 @@ struct event_conn_state_t {
     struct timeout_queue *q;
     /** the timer event for this entry */
     timer_event_t *te;
+    /** user pollfds (for suspended connection) */
+    struct user_poll_baton *user_baton;
 
     /*
      * when queued to workers
@@ -317,6 +321,8 @@ struct event_conn_state_t {
      * hooks) */
     suspended :1,
+    /** Did the connection time out? */
+    timed_out :1,
     /** Is lingering close from defer_lingering_close()? */
     deferred_linger :1,
     /** Has ap_start_lingering_close() been called? */
@@ -497,6 +503,15 @@ static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *cs)
     apr_time_t elem_expiry;
     apr_time_t next_expiry;
 
+    /* It greatly simplifies the logic to use a single timeout value per q
+     * because the new element can just be added to the end of the list and
+     * it will stay sorted in expiration time sequence.
If brand new + * sockets are sent to the event thread for a readability check, this + * will be a slight behavior change - they use the non-keepalive + * timeout today. With a normal client, the socket will be readable in + * a few milliseconds anyway. + */ + ap_assert(q && !cs->q); cs->q = q; @@ -619,14 +634,14 @@ typedef struct void *baton; } listener_poll_type; -typedef struct socket_callback_baton -{ - ap_mpm_callback_fn_t *cbfunc; - void *user_baton; +struct user_poll_baton { + apr_pool_t *pool; + event_conn_state_t *cs; apr_array_header_t *pfds; + apr_thread_mutex_t *mutex; /* pfds added/removed atomically */ timer_event_t *cancel_event; /* If a timeout was requested, a pointer to the timer event */ - struct socket_callback_baton *next; -} socket_callback_baton_t; + struct user_poll_baton *next; /* chaining */ +}; typedef struct event_child_bucket { ap_pod_t *pod; @@ -1120,6 +1135,9 @@ static int event_query(int query_code, int *result, apr_status_t *rv) case AP_MPMQ_CAN_WAITIO: *result = 1; break; + case AP_MPMQ_CAN_POLL_SUSPENDED: + *result = 1; + break; default: *rv = APR_ENOTIMPL; break; @@ -1223,11 +1241,8 @@ static apr_status_t decrement_connection_count(void *cs_) "connection %" CS_FMT_TO " cleaned up", CS_ARG_TO(cs)); - switch (cs->pub.state) { - case CONN_STATE_SUSPENDED: + if (cs->suspended) { apr_atomic_dec32(&suspended_count); - default: - break; } /* Unblock the listener if it's waiting for connection_count = 0, @@ -1250,15 +1265,24 @@ static apr_status_t decrement_connection_count(void *cs_) static void notify_suspend(event_conn_state_t *cs) { - ap_run_suspend_connection(cs->c, cs->r); - cs->c->sbh = NULL; + AP_DEBUG_ASSERT(!cs->suspended); + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "Suspend connection %" CS_FMT, CS_ARG(cs)); + apr_atomic_inc32(&suspended_count); cs->suspended = 1; + + cs->c->sbh = NULL; + cs->c->suspended_baton = cs; + ap_run_suspend_connection(cs->c, cs->r); } -static void notify_resume(event_conn_state_t *cs, int cleanup) +static void notify_resume(event_conn_state_t *cs) { - cs->suspended = 0; - cs->c->sbh = cleanup ? NULL : cs->sbh; + AP_DEBUG_ASSERT(cs->suspended); + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "Resume connection %" CS_FMT, CS_ARG(cs)); + + cs->c->sbh = cs->sbh; ap_run_resume_connection(cs->c, cs->r); } @@ -1360,12 +1384,13 @@ static void shutdown_connection(event_conn_state_t *cs, apr_time_t now, * if the connection is currently suspended as far as modules * know, provide notification of resumption. 
 */
-static apr_status_t ptrans_pre_cleanup(void *dummy)
+static apr_status_t ptrans_pre_cleanup(void *arg)
 {
-    event_conn_state_t *cs = dummy;
-
+    event_conn_state_t *cs = arg;
     if (cs->suspended) {
-        notify_resume(cs, 1);
+        cs->sbh = NULL;
+        cs->pub.state = CONN_STATE_LINGER;
+        notify_resume(cs);
     }
     return APR_SUCCESS;
 }
@@ -1440,7 +1465,8 @@ static int pollset_add_at(event_conn_state_t *cs, int sense,
                   (int)cs->pfd.reqevents, CS_ARG(cs),
                   at, line);
 
-    ap_assert(cs->q == NULL && cs->te == NULL && ((q != NULL) ^ (te != NULL)));
+    ap_assert((q != NULL) ^ (te != NULL));
+    ap_assert(cs->q == NULL && cs->te == NULL);
 
     set_conn_state_sense(cs, sense);
 
@@ -1497,8 +1523,6 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
                   (int)cs->pfd.reqevents, CS_ARG(cs),
                   at, line);
 
-    ap_assert((cs->q != NULL) ^ (cs->te != NULL));
-
     if (cs->q) {
         if (!locked) {
             apr_thread_mutex_lock(timeout_mutex);
@@ -1508,7 +1532,7 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
             apr_thread_mutex_unlock(timeout_mutex);
         }
     }
-    else {
+    else if (cs->te) {
         cs->te->canceled = 1;
         cs->te = NULL;
     }
@@ -1537,8 +1561,7 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
 /* Forward declare */
 static timer_event_t *get_timer_event(apr_time_t timeout,
                                       ap_mpm_callback_fn_t *cbfn, void *baton,
-                                      int insert,
-                                      apr_array_header_t *pfds);
+                                      int insert);
 static void process_lingering_close(event_conn_state_t *cs);
 
 static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd)
@@ -1640,22 +1663,28 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
             close_connection(cs);
             return;
         }
-
-        cs->pub.sense = CONN_SENSE_DEFAULT;
     }
     else {
         /* The connection is scheduled back */
         c = cs->c;
         c->current_thread = thd;
         c->id = conn_id; /* thread number is part of ID */
         ap_update_sb_handle(cs->sbh, my_child_num, my_thread_num);
-        notify_resume(cs, 0);
+    }
+
+    /* Hooks for suspended connections run here and don't fall through */
+    if (cs->suspended) {
+        ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c,
+                      "resuming connection %" CS_FMT, CS_ARG(cs));
+        notify_resume(cs);
+        return;
     }
 
     ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c,
                   "processing connection %" CS_FMT " (aborted %d, clogging %d)",
                   CS_ARG(cs), c->aborted, c->clogging_input_filters);
 
-    if (cs->pub.state == CONN_STATE_LINGER) {
+    if (cs->pub.state == CONN_STATE_LINGER || c->aborted) {
+        cs->pub.state = CONN_STATE_LINGER;
         goto lingering_close;
     }
 
@@ -1697,16 +1726,15 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
          * worker or prefork MPMs for instance.
          */
         switch (rc) {
-        case DONE:
-            rc = OK; /* same as OK, fall through */
         case OK:
+        case DONE: /* same as OK, fall through */
            if (cs->pub.state == CONN_STATE_PROCESSING) {
                cs->pub.state = CONN_STATE_LINGER;
            }
            else if (cs->pub.state == CONN_STATE_KEEPALIVE) {
                cs->pub.state = CONN_STATE_WRITE_COMPLETION;
            }
-            break;
+            rc = OK;
         }
         if (rc != OK || (cs->pub.state != CONN_STATE_LINGER
                          && cs->pub.state != CONN_STATE_ASYNC_WAITIO
@@ -1735,7 +1763,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
          * event thread poll for read/writeability.
          */
         ap_update_child_status(cs->sbh, SERVER_BUSY_READ, NULL);
-        notify_suspend(cs);
 
        /* If the connection timeout is actually different than the waitio_q's,
         * use a timer event to honor it (e.g.
mod_reqtimeout may enforce its
@@ -1747,7 +1774,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
             if (timeout < TIMERS_FUDGE_TIMEOUT) {
                 timeout = TIMERS_FUDGE_TIMEOUT;
             }
-            te = get_timer_event(timeout, NULL, cs, 1, NULL);
+            te = get_timer_event(timeout, NULL, cs, 1);
         }
         else {
             q = cs->sc->io_q;
@@ -1776,7 +1803,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
         }
         if (pending == AGAIN) {
             /* Let the event thread poll for write */
-            notify_suspend(cs);
             cs->pub.sense = CONN_SENSE_DEFAULT;
             if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) {
                 return; /* queued */
@@ -1804,16 +1830,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
     if (cs->pub.state == CONN_STATE_KEEPALIVE) {
         ap_update_child_status(cs->sbh, SERVER_BUSY_KEEPALIVE, NULL);
 
-        /* It greatly simplifies the logic to use a single timeout value per q
-         * because the new element can just be added to the end of the list and
-         * it will stay sorted in expiration time sequence. If brand new
-         * sockets are sent to the event thread for a readability check, this
-         * will be a slight behavior change - they use the non-keepalive
-         * timeout today. With a normal client, the socket will be readable in
-         * a few milliseconds anyway.
-         */
-        notify_suspend(cs);
-
+        cs->pub.sense = CONN_SENSE_DEFAULT;
         if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q, NULL)) {
             cs->pub.state = CONN_STATE_LINGER;
             goto lingering_close;
@@ -1823,33 +1840,149 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
     }
 
     if (cs->pub.state == CONN_STATE_SUSPENDED) {
-        cs->c->suspended_baton = cs;
-        apr_atomic_inc32(&suspended_count);
         notify_suspend(cs);
-        return; /* done */
+        return; /* suspended */
     }
 
  lingering_close:
     process_lingering_close(cs);
 }
 
+static apr_status_t user_poll_cleanup(void *data)
+{
+    struct user_poll_baton *user_baton = data;
+    apr_array_header_t *pfds = user_baton->pfds;
+    apr_status_t rc, final_rc = APR_SUCCESS;
+    int i;
+
+    /* All the pollfds should be added/removed atomically, so synchronize
+     * with event_poll_suspended().
+ */ + apr_thread_mutex_lock(user_baton->mutex); + for (i = 0; i < pfds->nelts; i++) { + apr_pollfd_t *pfd = (apr_pollfd_t *)pfds->elts + i; + if (pfd->client_data) { + rc = apr_pollset_remove(event_pollset, pfd); + if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) { + final_rc = rc; + } + pfd->client_data = NULL; + } + } + apr_thread_mutex_unlock(user_baton->mutex); + + if (final_rc) { + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + } + return final_rc; +} + +/* Put some user pollfds into the listener pollset for a SUSPENDED connection */ +static apr_status_t event_poll_suspended(conn_rec *c, apr_pool_t *p, + const apr_array_header_t *user_pfds, + apr_interval_time_t timeout) +{ + event_conn_state_t *cs = c->suspended_baton; + apr_status_t rc, final_rc = APR_SUCCESS; + struct user_poll_baton *user_baton; + apr_array_header_t *pfds; + listener_poll_type *pt; + int i; + + AP_DEBUG_ASSERT(cs != NULL); + AP_DEBUG_ASSERT(cs->suspended); + AP_DEBUG_ASSERT(user_pfds->nelts > 0); + if (cs == NULL) { + ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO() + "event_poll_suspended: suspended_baton is NULL"); + return APR_EINVAL; + } + if (!cs->suspended) { + ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO() + "event_poll_suspended: thread isn't suspended"); + return APR_EINVAL; + } + if (user_pfds->nelts <= 0) { + ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO() + "event_poll_suspended: no poll FDs"); + return APR_EINVAL; + } + + cs->pub.state = CONN_STATE_SUSPENDED; + cs->user_baton = user_baton = apr_pcalloc(p, sizeof(*user_baton)); + apr_thread_mutex_create(&user_baton->mutex, APR_THREAD_MUTEX_DEFAULT, p); + user_baton->pfds = pfds = apr_array_copy(p, user_pfds); + user_baton->pool = p; + user_baton->cs = cs; + + apr_pool_pre_cleanup_register(p, user_baton, user_poll_cleanup); + + pt = apr_pcalloc(p, sizeof(*pt)); + pt->baton = user_baton; + pt->type = PT_USER; + + if (timeout >= 0) { + /* Prevent the timer from firing before the pollset is updated */ + if (timeout < TIMERS_FUDGE_TIMEOUT) { + timeout = TIMERS_FUDGE_TIMEOUT; + } + user_baton->cancel_event = get_timer_event(timeout, NULL, cs, 1); + } + cs->te = user_baton->cancel_event; + + /* All the pollfds should be added/removed atomically, so synchronize + * with user_poll_cleanup(). + */ + apr_thread_mutex_lock(user_baton->mutex); + for (i = 0; i < pfds->nelts; i++) { + apr_pollfd_t *pfd = (apr_pollfd_t *)pfds->elts + i; + if (pfd->reqevents) { + if (pfd->reqevents & APR_POLLIN) { + pfd->reqevents |= APR_POLLHUP; + } + pfd->reqevents |= APR_POLLERR; + pfd->client_data = pt; + + rc = apr_pollset_add(event_pollset, pfd); + if (rc != APR_SUCCESS) { + final_rc = rc; + } + } + else { + pfd->client_data = NULL; + } + } + apr_thread_mutex_unlock(user_baton->mutex); + + if (final_rc) { + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + } + return final_rc; +} + /* Put a SUSPENDED connection back into a queue. 
*/ -static apr_status_t event_resume_suspended (conn_rec *c) +static apr_status_t event_resume_suspended(conn_rec *c) { - event_conn_state_t* cs = (event_conn_state_t*) c->suspended_baton; + event_conn_state_t *cs = c->suspended_baton; + + AP_DEBUG_ASSERT(cs != NULL); + AP_DEBUG_ASSERT(cs->suspended); if (cs == NULL) { ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO(02615) "event_resume_suspended: suspended_baton is NULL"); - return APR_EGENERAL; + return APR_EINVAL; } if (!cs->suspended) { ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO(02616) - "event_resume_suspended: Thread isn't suspended"); - return APR_EGENERAL; + "event_resume_suspended: thread isn't suspended"); + return APR_EINVAL; } - apr_atomic_dec32(&suspended_count); - c->suspended_baton = NULL; + cs->c->suspended_baton = NULL; + cs->c->sbh = cs->sbh; + cs->suspended = 0; cs->pub.sense = CONN_SENSE_DEFAULT; if (cs->pub.state != CONN_STATE_LINGER) { @@ -1857,7 +1990,6 @@ static apr_status_t event_resume_suspended (conn_rec *c) if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) { return APR_SUCCESS; /* queued */ } - /* fall through lingering close on error */ cs->pub.state = CONN_STATE_LINGER; } @@ -2150,8 +2282,7 @@ static apr_thread_mutex_t *g_timer_skiplist_mtx; static timer_event_t *get_timer_event(apr_time_t timeout, ap_mpm_callback_fn_t *cbfn, void *baton, - int insert, - apr_array_header_t *pfds) + int insert) { timer_event_t *te; apr_time_t now = (timeout < 0) ? 0 : event_time_now(); @@ -2179,7 +2310,6 @@ static timer_event_t *get_timer_event(apr_time_t timeout, te->baton = baton; te->when = now + timeout; te->timeout = timeout; - te->pfds = pfds; if (insert) { apr_time_t next_expiry; @@ -2219,122 +2349,15 @@ static void put_timer_event(timer_event_t *te, int locked) } } -static apr_status_t event_register_timed_callback_ex(apr_time_t timeout, - ap_mpm_callback_fn_t *cbfn, - void *baton, - apr_array_header_t *pfds) -{ - if (!cbfn) { - return APR_EINVAL; - } - get_timer_event(timeout, cbfn, baton, 1, pfds); - return APR_SUCCESS; -} - static apr_status_t event_register_timed_callback(apr_time_t timeout, ap_mpm_callback_fn_t *cbfn, void *baton) { - event_register_timed_callback_ex(timeout, cbfn, baton, NULL); - return APR_SUCCESS; -} - -static apr_status_t event_cleanup_poll_callback(void *data) -{ - apr_status_t final_rc = APR_SUCCESS; - apr_array_header_t *pfds = data; - int i; - - for (i = 0; i < pfds->nelts; i++) { - apr_pollfd_t *pfd = (apr_pollfd_t *)pfds->elts + i; - if (pfd->client_data) { - apr_status_t rc; - rc = apr_pollset_remove(event_pollset, pfd); - if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) { - final_rc = rc; - } - pfd->client_data = NULL; - } - } - - if (final_rc) { - AP_DEBUG_ASSERT(0); - signal_threads(ST_GRACEFUL); - } - return final_rc; -} - -static apr_status_t event_register_poll_callback_ex(apr_pool_t *p, - const apr_array_header_t *pfds, - ap_mpm_callback_fn_t *cbfn, - ap_mpm_callback_fn_t *tofn, - void *baton, - apr_time_t timeout) -{ - listener_poll_type *pt; - socket_callback_baton_t *scb; - apr_status_t rc, final_rc = APR_SUCCESS; - int i; - - if (!cbfn || !tofn) { + if (!cbfn) { return APR_EINVAL; } - - scb = apr_pcalloc(p, sizeof(*scb)); - scb->cbfunc = cbfn; - scb->user_baton = baton; - scb->pfds = apr_array_copy(p, pfds); - - pt = apr_palloc(p, sizeof(*pt)); - pt->type = PT_USER; - pt->baton = scb; - - apr_pool_pre_cleanup_register(p, scb->pfds, event_cleanup_poll_callback); - - for (i = 0; i < scb->pfds->nelts; i++) { - apr_pollfd_t *pfd = (apr_pollfd_t 
*)scb->pfds->elts + i;
-        if (pfd->reqevents) {
-            if (pfd->reqevents & APR_POLLIN) {
-                pfd->reqevents |= APR_POLLHUP;
-            }
-            pfd->reqevents |= APR_POLLERR;
-            pfd->client_data = pt;
-        }
-        else {
-            pfd->client_data = NULL;
-        }
-    }
-
-    if (timeout > 0) {
-        /* Prevent the timer from firing before the pollset is updated */
-        if (timeout < TIMERS_FUDGE_TIMEOUT) {
-            timeout = TIMERS_FUDGE_TIMEOUT;
-        }
-        scb->cancel_event = get_timer_event(timeout, tofn, baton, 1, scb->pfds);
-    }
-    for (i = 0; i < scb->pfds->nelts; i++) {
-        apr_pollfd_t *pfd = (apr_pollfd_t *)scb->pfds->elts + i;
-        if (pfd->client_data) {
-            rc = apr_pollset_add(event_pollset, pfd);
-            if (rc != APR_SUCCESS) {
-                final_rc = rc;
-            }
-        }
-    }
-    return final_rc;
-}
-
-static apr_status_t event_register_poll_callback(apr_pool_t *p,
-                                                 const apr_array_header_t *pfds,
-                                                 ap_mpm_callback_fn_t *cbfn,
-                                                 void *baton)
-{
-    return event_register_poll_callback_ex(p,
-                                           pfds,
-                                           cbfn,
-                                           NULL, /* no timeout function */
-                                           baton,
-                                           0 /* no timeout */);
+    get_timer_event(timeout, cbfn, baton, 1);
+    return APR_SUCCESS;
 }
 
 /*
@@ -2363,11 +2386,9 @@ static void process_lingering_close(event_conn_state_t *cs)
     conn_rec *c = cs->c;
     int rc = OK;
 
-    cs->pub.state = CONN_STATE_LINGER;
-
     if (!cs->linger_started) {
         cs->linger_started = 1; /* once! */
-        notify_suspend(cs);
+        cs->pub.state = CONN_STATE_LINGER;
 
         /* Shutdown the connection, i.e. pre_connection_close hooks,
          * SSL/TLS close notify, WC bucket, etc..
@@ -2431,8 +2452,7 @@ static void process_lingering_close(event_conn_state_t *cs)
  * Pre-condition: timeout_mutex must already be locked
  */
 static unsigned int process_timeout_queue_ex(struct timeout_queue *queue,
-                                             apr_time_t now,
-                                             int shrink)
+                                             apr_time_t now, int shrink)
 {
     unsigned int count = 0;
     struct timeout_queue *q;
@@ -2466,6 +2486,7 @@ static unsigned int process_timeout_queue_ex(struct timeout_queue *queue,
                 break;
             }
         }
+        cs->timed_out = 1;
 
         if (cs_in_backlog(cs)) {
             /* Remove the backlog connection from worker_queue (note that
@@ -2473,8 +2494,8 @@ static unsigned int process_timeout_queue_ex(struct timeout_queue *queue,
              * the backlog_q), and unreserve/set a worker/idler since
              * none could handle the event.
              */
-            ap_assert(cs_qe(cs)->cb_baton == cs);
             ap_assert(cs->q == cs->sc->bl_q);
+            ap_assert(cs_qe(cs)->cb_baton == cs);
             ap_queue_info_idlers_inc(worker_queue_info);
             ap_queue_kill_event_locked(worker_queue, cs_qe(cs));
             shutdown_connection(cs, now, 1);
@@ -2588,7 +2609,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
     apr_time_t next_expiry = -1;
     apr_interval_time_t timeout = -1;
     int workers_were_busy = 0, force_stats = 0;
-    socket_callback_baton_t *user_chain;
+    struct user_poll_baton *user_chain;
     const apr_pollfd_t *out_pfd;
     apr_time_t now, poll_time;
     event_conn_state_t *cs;
@@ -2653,24 +2674,54 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
                     continue;
                 }
 
+                /* A timer without a callback is a cancel event for a cs in
+                 * either:
+                 * 1. CONN_STATE_ASYNC_WAITIO: the timer enforces a timeout
+                 *    different from the cs->sc->io_q's;
+                 * 2. CONN_STATE_SUSPENDED: the timer enforces a timeout for
+                 *    some user pollfds bound to the cs.
+                 * In both cases te->baton is the (timed out) cs.
+                 * For 1. we can shut down the connection now, but for 2. we
+                 * need to resume the suspended connection in a worker thread
+                 * so that the responsible module is notified; we do that by
+                 * setting CONN_STATE_LINGER, plus cs->timed_out to make sure
+                 * that this state is maintained/restored after the next/last
+                 * ap_run_resume_connection(), so that the actual close is
+                 * issued.
+                 */
                 if (!te->cbfunc) {
                     cs = te->baton;
+                    AP_DEBUG_ASSERT(cs != NULL);
+                    AP_DEBUG_ASSERT(cs->te == te);
                     put_timer_event(te, 1);
-                    ap_assert(cs && cs->te == te);
-                    ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c,
-                                  "timed out connection %" CS_FMT,
-                                  CS_ARG(cs));
-                    (void)pollset_del(cs, 0);
-                    kill_connection(cs, APR_TIMEUP);
-                    continue;
-                }
+                    cs->te = te = NULL;
+                    cs->timed_out = 1;
+
+                    if (!cs->user_baton) {
+                        ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c,
+                                      "timed out connection %" CS_FMT,
+                                      CS_ARG(cs));
+                        (void)pollset_del(cs, 0);
+                        shutdown_connection(cs, now, 0);
+                        continue;
+                    }
+
+                    /* Remove all user pollfds from the pollset */
+                    AP_DEBUG_ASSERT(cs->user_baton->pfds != NULL);
+                    apr_pool_cleanup_run(cs->user_baton->pool, cs->user_baton,
+                                         user_poll_cleanup);
+#ifdef AP_DEBUG
+                    memset(cs->user_baton, 0, sizeof(*cs->user_baton));
+#endif
+                    cs->user_baton = NULL;
 
-                if (te->pfds) {
-                    /* remove all sockets from the pollset */
-                    apr_pool_cleanup_run(te->pfds->pool, te->pfds,
-                                         event_cleanup_poll_callback);
+                    AP_DEBUG_ASSERT(cs->suspended);
+                    cs->pub.state = CONN_STATE_LINGER;
                 }
-                push2worker(NULL, te, now, &workers_were_busy);
+                else {
+                    cs = NULL;
+                }
+
+                push2worker(cs, te, now, &workers_were_busy);
             }
             if (te) {
                 next_expiry = te->when;
@@ -2778,7 +2829,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
 
         for (user_chain = NULL; num > 0; --num, ++out_pfd) {
             listener_poll_type *pt = out_pfd->client_data;
-            socket_callback_baton_t *baton;
+            struct user_poll_baton *user_baton;
 
             switch (pt->type) {
             case PT_CSD:
@@ -2894,13 +2945,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
             case PT_USER:
                 /* Multiple pfds of the same baton might trigger in this pass
                  * so chain once here and run the cleanup only after this loop
-                 * to avoid lifetime issues (i.e. pfds->pool cleared while some
-                 * of its pfd->client_data are still to be dereferenced here).
+                 * to avoid lifetime issues (i.e. user_baton->pool cleared while
+                 * some of its pfd->client_data are still to be dereferenced here).
*/ - baton = pt->baton; - if (baton != user_chain && !baton->next) { - baton->next = user_chain; - user_chain = baton; + user_baton = pt->baton; + if (user_baton != user_chain && !user_baton->next) { + user_baton->next = user_chain; + user_chain = user_baton; } break; } @@ -2908,27 +2959,32 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* Time to queue user callbacks chained above */ while (user_chain) { - socket_callback_baton_t *baton = user_chain; - user_chain = user_chain->next; - baton->next = NULL; + struct user_poll_baton *user_baton = user_chain; + user_chain = user_baton->next; + user_baton->next = NULL; + + cs = user_baton->cs; + AP_DEBUG_ASSERT(cs != NULL); + AP_DEBUG_ASSERT(cs->user_baton == user_baton); + AP_DEBUG_ASSERT(cs->te == user_baton->cancel_event); + AP_DEBUG_ASSERT(cs->pub.state == CONN_STATE_SUSPENDED); + AP_DEBUG_ASSERT(cs->suspended); /* Not expirable anymore */ - if (baton->cancel_event) { - baton->cancel_event->canceled = 1; - baton->cancel_event = NULL; + if (cs->te) { + cs->te->canceled = 1; + cs->te = NULL; } - /* remove all sockets from the pollset */ - apr_pool_cleanup_run(baton->pfds->pool, baton->pfds, - event_cleanup_poll_callback); + /* Remove all user pollfds from the pollset */ + apr_pool_cleanup_run(user_baton->pool, user_baton, + user_poll_cleanup); +#ifdef AP_DEBUG + memset(user_baton, 0, sizeof(*user_baton)); +#endif - /* masquerade as a timer event that is firing */ - te = get_timer_event(-1 /* fake timer */, - baton->cbfunc, - baton->user_baton, - 0, /* don't insert it */ - NULL /* no associated socket callback */); - push2worker(NULL, te, now, &workers_were_busy); + /* Schedule ap_run_resume_connection() */ + push2worker(cs, NULL, now, &workers_were_busy); } /* We process the timeout queues here only when the global @@ -2959,6 +3015,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) */ process_timeout_queue(shutdown_q, now); + /* No specific requirement/order for those */ process_timeout_queue(waitio_q, now); process_timeout_queue(write_completion_q, now); process_timeout_queue(keepalive_q, now); @@ -4433,7 +4490,6 @@ static void setup_slave_conn(conn_rec *c, void *csd) cs = make_conn_state(c->pool, csd); cs->c = c; cs->sc = mcs->sc; - cs->suspended = 0; cs->bucket_alloc = c->bucket_alloc; cs->pfd = mcs->pfd; cs->pub = mcs->pub; @@ -5085,14 +5141,11 @@ static void event_hooks(apr_pool_t * p) ap_hook_mpm_query(event_query, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_mpm_register_timed_callback(event_register_timed_callback, NULL, NULL, APR_HOOK_MIDDLE); - ap_hook_mpm_register_poll_callback(event_register_poll_callback, - NULL, NULL, APR_HOOK_MIDDLE); - ap_hook_mpm_register_poll_callback_timeout(event_register_poll_callback_ex, - NULL, NULL, APR_HOOK_MIDDLE); ap_hook_pre_read_request(event_pre_read_request, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_post_read_request(event_post_read_request, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_mpm_get_name(event_get_name, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_mpm_resume_suspended(event_resume_suspended, NULL, NULL, APR_HOOK_MIDDLE); + ap_hook_mpm_poll_suspended(event_poll_suspended, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_pre_connection(event_pre_connection, NULL, NULL, APR_HOOK_REALLY_FIRST); ap_hook_protocol_switch(event_protocol_switch, NULL, NULL, APR_HOOK_REALLY_FIRST); diff --git a/server/mpm_common.c b/server/mpm_common.c index 2973bc9f4f2..d055fa2fd99 100644 --- a/server/mpm_common.c +++ b/server/mpm_common.c @@ -68,10 +68,9 @@ APR_HOOK_LINK(mpm) \ 
APR_HOOK_LINK(mpm_query) \ APR_HOOK_LINK(mpm_register_timed_callback) \ - APR_HOOK_LINK(mpm_register_poll_callback) \ - APR_HOOK_LINK(mpm_register_poll_callback_timeout) \ APR_HOOK_LINK(mpm_get_name) \ APR_HOOK_LINK(mpm_resume_suspended) \ + APR_HOOK_LINK(mpm_poll_suspended) \ APR_HOOK_LINK(end_generation) \ APR_HOOK_LINK(child_status) \ APR_HOOK_LINK(output_pending) \ @@ -111,16 +110,11 @@ AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_register_timed_callback, AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_resume_suspended, (conn_rec *c), (c), APR_ENOTIMPL) -AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_register_poll_callback, - (apr_pool_t *p, const apr_array_header_t *pds, - ap_mpm_callback_fn_t *cbfn, void *baton), - (p, pds, cbfn, baton), APR_ENOTIMPL) -AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_register_poll_callback_timeout, - (apr_pool_t *p, const apr_array_header_t *pds, - ap_mpm_callback_fn_t *cbfn, - ap_mpm_callback_fn_t *tofn, - void *baton, apr_time_t timeout), - (p, pds, cbfn, tofn, baton, timeout), APR_ENOTIMPL) +AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_poll_suspended, + (conn_rec *c, apr_pool_t *p, + const apr_array_header_t *pfds, + apr_interval_time_t timeout), + (c, p, pfds, timeout), APR_ENOTIMPL) AP_IMPLEMENT_HOOK_RUN_FIRST(int, output_pending, (conn_rec *c), (c), DECLINED) AP_IMPLEMENT_HOOK_RUN_FIRST(int, input_pending, @@ -573,26 +567,17 @@ AP_DECLARE(apr_status_t) ap_mpm_resume_suspended(conn_rec *c) return ap_run_mpm_resume_suspended(c); } -AP_DECLARE(apr_status_t) ap_mpm_register_timed_callback(apr_time_t t, - ap_mpm_callback_fn_t *cbfn, void *baton) +AP_DECLARE(apr_status_t) ap_mpm_poll_suspended(conn_rec *c, apr_pool_t *p, + const apr_array_header_t *pfds, + apr_interval_time_t timeout) { - return ap_run_mpm_register_timed_callback(t, cbfn, baton); + return ap_run_mpm_poll_suspended(c, p, pfds, timeout); } -AP_DECLARE(apr_status_t) ap_mpm_register_poll_callback( - apr_pool_t *p, const apr_array_header_t *pfds, +AP_DECLARE(apr_status_t) ap_mpm_register_timed_callback(apr_time_t t, ap_mpm_callback_fn_t *cbfn, void *baton) { - return ap_run_mpm_register_poll_callback(p, pfds, cbfn, baton); -} - -AP_DECLARE(apr_status_t) ap_mpm_register_poll_callback_timeout( - apr_pool_t *p, const apr_array_header_t *pfds, - ap_mpm_callback_fn_t *cbfn, ap_mpm_callback_fn_t *tofn, - void *baton, apr_time_t timeout) -{ - return ap_run_mpm_register_poll_callback_timeout(p, pfds, cbfn, tofn, - baton, timeout); + return ap_run_mpm_register_timed_callback(t, cbfn, baton); } AP_DECLARE(const char *)ap_show_mpm(void) diff --git a/server/mpm_fdqueue.h b/server/mpm_fdqueue.h index 29297fd60d5..4bb17c82955 100644 --- a/server/mpm_fdqueue.h +++ b/server/mpm_fdqueue.h @@ -89,7 +89,6 @@ struct timer_event_t ap_mpm_callback_fn_t *cbfunc; void *baton; int canceled; - apr_array_header_t *pfds; apr_interval_time_t timeout; }; typedef struct timer_event_t timer_event_t; From 92d0cdd150e7cab265dbd52b01a32954333eaf8a Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 27 Jun 2023 01:54:48 +0200 Subject: [PATCH 22/22] mod_status: Be less racy, improve rendering, and show suspended connections. 
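The "less racy" part: each process_score slot is snapshotted with a
seqlock-like retry loop (read the pid/generation pair, copy the whole
record, re-check the pair, and retry if the slot was reused concurrently),
and the snapshot is re-validated before per-process counters are trusted.
A minimal sketch of the idea, for illustration only (this helper is not
code added by the patch; status_handler() below does it inline with
proc_buffer[] and ps_record):

    static void snapshot_process_score(int i, process_score *dst,
                                       pid_t *pid, ap_generation_t *gen)
    {
        volatile process_score *ps = ap_get_scoreboard_process(i);
        do {
            *pid = ps->pid;               /* slot identity before the copy */
            *gen = ps->generation;
            memcpy(dst, (void *)ps, sizeof(*dst));
        } while (dst->generation != *gen  /* slot reused meanwhile? retry */
                 || dst->pid != *pid);
    }

With this, the report never mixes counters from two different children
that occupied the same scoreboard slot, without taking any lock.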
--- modules/generators/mod_status.c | 240 ++++++++++++++++++++------------ modules/lua/lua_request.c | 4 +- 2 files changed, 154 insertions(+), 90 deletions(-) diff --git a/modules/generators/mod_status.c b/modules/generators/mod_status.c index 5ff635cc96e..20187af882b 100644 --- a/modules/generators/mod_status.c +++ b/modules/generators/mod_status.c @@ -71,6 +71,7 @@ #define APR_WANT_STRFUNC #include "apr_want.h" #include "apr_strings.h" +#include "apr_atomic.h" #define STATUS_MAXLINE 64 @@ -199,10 +200,15 @@ static int status_handler(request_rec *r) int short_report; int no_table_report; global_score *global_record; - worker_score *ws_record; + volatile process_score *ps; process_score *ps_record; + worker_score *ws_record; char *stat_buffer; - pid_t *pid_buffer, worker_pid; + pid_t worker_pid; + struct { + pid_t pid; + ap_generation_t gen; + } *proc_buffer; int *thread_idle_buffer = NULL; int *thread_graceful_buffer = NULL; int *thread_busy_buffer = NULL; @@ -249,7 +255,7 @@ static int status_handler(request_rec *r) return HTTP_INTERNAL_SERVER_ERROR; } - pid_buffer = apr_palloc(r->pool, server_limit * sizeof(pid_t)); + proc_buffer = apr_palloc(r->pool, server_limit * sizeof(*proc_buffer)); stat_buffer = apr_palloc(r->pool, server_limit * thread_limit * sizeof(char)); if (is_async) { thread_idle_buffer = apr_palloc(r->pool, server_limit * sizeof(int)); @@ -311,6 +317,7 @@ static int status_handler(request_rec *r) } } + ps_record = apr_palloc(r->pool, sizeof *ps_record); ws_record = apr_palloc(r->pool, sizeof *ws_record); for (i = 0; i < server_limit; ++i) { @@ -319,7 +326,15 @@ static int status_handler(request_rec *r) clock_t tmp_tu, tmp_ts, tmp_tcu, tmp_tcs; #endif - ps_record = ap_get_scoreboard_process(i); + /* Snapshot all in one go */ + ps = ap_get_scoreboard_process(i); + do { + proc_buffer[i].pid = ps->pid; + proc_buffer[i].gen = ps->generation; + memcpy(ps_record, (void *)ps, sizeof(*ps_record)); + } while (ps_record->generation != proc_buffer[i].gen + || ps_record->pid != proc_buffer[i].pid); + if (is_async) { thread_idle_buffer[i] = 0; thread_graceful_buffer[i] = 0; @@ -328,7 +343,12 @@ static int status_handler(request_rec *r) for (j = 0; j < thread_limit; ++j) { int indx = (i * thread_limit) + j; - ap_copy_scoreboard_worker(ws_record, i, j); + if (ps_record->pid) { + ap_copy_scoreboard_worker(ws_record, i, j); + } + else { + memset(ws_record, 0, sizeof(*ws_record)); + } res = ws_record->status; if ((i >= max_servers || j >= threads_per_child) @@ -337,8 +357,8 @@ static int status_handler(request_rec *r) else stat_buffer[indx] = status_flags[res]; - if (!ps_record->quiescing - && ps_record->pid) { + if (ps_record->pid + && !ps_record->quiescing) { if (res == SERVER_READY) { if (ps_record->generation == mpm_generation) idle++; @@ -410,7 +430,6 @@ static int status_handler(request_rec *r) tcu += proc_tcu; tcs += proc_tcs; #endif - pid_buffer[i] = ps_record->pid; } /* up_time in seconds */ @@ -426,14 +445,15 @@ static int status_handler(request_rec *r) "

<html><head>\n<title>Apache Server Status for ", r);
     ap_rvputs(r, ap_escape_html(r->pool, ap_get_server_name(r)),
               " (via ", r->connection->local_ip,
-              ")</title></head><body>\n\n", NULL);
-    ap_rvputs(r, "<dl><dt>Server Version: ",
+              ")\n", NULL);
+    ap_rvputs(r, "</title></head><body>\n<dl><dt>Server Version: ",
                  ap_get_server_description(), "</dt>\n", NULL);
-    ap_rvputs(r, "<dt>Server MPM: ",
-                 ap_show_mpm(), "</dt>\n", NULL);
     ap_rvputs(r, "<dt>Server Built: ",
-                 ap_get_server_built(), "\n</dt></dl><hr /><dl>\n", NULL);
-    ap_rvputs(r, "<dt>Current Time: ",
+                 ap_get_server_built(), "</dt>\n", NULL);
+    ap_rvputs(r, "<dt>Server MPM: ",
+                 ap_show_mpm(), "</dt>\n</dl>\n"
+                 "<hr/>\n", NULL);
+    ap_rvputs(r, "<dl>\n<dt>Current Time: ",
                  ap_ht_time(r->pool, nowtime, DEFAULT_TIME_FORMAT, 0),
                  "</dt>\n", NULL);
     ap_rvputs(r, "<dt>Restart Time: ",
@@ -561,97 +581,131 @@ static int status_handler(request_rec *r)
         ap_rprintf(r, "BusyWorkers: %d\nGracefulWorkers: %d\nIdleWorkers: %d\n",
                    busy, graceful, idle);
 
     if (!short_report)
-        ap_rputs("</dl>", r);
+        ap_rputs("</dl>\n", r);
 
     if (is_async) {
-        int wait_io = 0, write_completion = 0, shutdown = 0, lingering_close = 0,
-            keep_alive = 0, connections = 0, stopping = 0, procs = 0;
+        apr_uint32_t procs = 0, stopping = 0, accepting = 0,
+                     connections = 0, backlog = 0, wait_io = 0, writing = 0,
+                     keep_alive = 0, shutdown = 0, suspended = 0, closing = 0;
         if (!short_report)
-            ap_rputs("\n\n<table rules=\"all\" cellpadding=\"1%\">\n"
-                     "<tr><th rowspan=\"2\">Slot</th>"
-                     "<th rowspan=\"2\">PID</th>"
-                     "<th rowspan=\"2\">Stopping</th>"
-                     "<th colspan=\"2\">Connections</th>\n"
-                     "<th colspan=\"3\">Threads</th>"
-                     "<th colspan=\"5\">Async connections</th></tr>\n"
-                     "<tr><th>total</th><th>accepting</th>"
-                     "<th>busy</th><th>graceful</th><th>idle</th>"
-                     "<th>wait-io</th><th>writing</th><th>keep-alive</th>"
-                     "<th>shutdown</th><th>closing</th></tr>\n", r);
+            ap_rputs("<table rules=\"all\" cellpadding=\"1%\">\n"
+                     "<tr><th colspan=\"4\">Processes</th>"
+                     "<th colspan=\"3\">Threads</th>\n"
+                     "<th colspan=\"2\">Connections</th>"
+                     "<th colspan=\"6\">Async queues</th></tr>\n"
+                     "<tr><th>Slot</th><th>PID</th>"
+                     "<th>stopping</th><th>accepting</th>"
+                     "<th>idle</th><th>busy</th><th>graceful</th>"
+                     "<th>total</th><th>backlog</th>"
+                     "<th>wait-io</th><th>writing</th><th>keep-alive</th>"
+                     "<th>shutdown</th><th>suspended</th><th>closing</th></tr>\n",
+                     r);
 
         for (i = 0; i < server_limit; ++i) {
-            ps_record = ap_get_scoreboard_process(i);
-            if (ps_record->pid) {
+            ps = ap_get_scoreboard_process(i);
+            if (!proc_buffer[i].pid
+                || ps->pid != proc_buffer[i].pid
+                || ps->generation != proc_buffer[i].gen) {
+                continue;
+            }
+
+            /* Still the same as what we accounted for earlier? */
+            memcpy(ps_record, (void *)ps, sizeof(*ps_record));
+            if (ps_record->pid == proc_buffer[i].pid
+                && ps_record->generation == proc_buffer[i].gen) {
                 connections += ps_record->connections;
+                backlog += ps_record->backlog;
                 wait_io += ps_record->wait_io;
-                write_completion += ps_record->write_completion;
+                writing += ps_record->write_completion;
                 keep_alive += ps_record->keep_alive;
                 shutdown += ps_record->shutdown;
-                lingering_close += ps_record->lingering_close;
+                suspended += ps_record->suspended;
+                closing += ps_record->lingering_close;
                 procs++;
                 if (ps_record->quiescing) {
                     stopping++;
                 }
+                if (!ps_record->not_accepting) {
+                    accepting++;
+                }
                 if (!short_report) {
                     const char *dying = "no";
                     const char *old = "";
+                    const char *listening = "yes";
                     if (ps_record->quiescing) {
                         dying = "yes";
                     }
-                    if (ps_record->generation != mpm_generation)
+                    if (ps_record->generation != mpm_generation) {
                         old = " (old gen)";
+                    }
+                    if (ps_record->not_accepting) {
+                        listening = "no";
+                    }
                     ap_rprintf(r, "<tr>"
-                               "<td>%u</td><td>%" APR_PID_T_FMT "</td>"
-                               "<td>%s%s</td><td>%u</td><td>%s</td>"
-                               "<td>%u</td><td>%u</td><td>%u</td>"
-                               "<td>%u</td><td>%u</td><td>%u</td>"
-                               "<td>%u</td><td>%u</td></tr>\n",
+                               "<td>%u</td><td>%" APR_PID_T_FMT "</td>"
+                               "<td>%s%s</td><td>%s</td>"
+                               "<td>%d</td><td>%d</td><td>%d</td>"
+                               "<td>%u</td><td>%u</td>"
+                               "<td>%u</td><td>%u</td><td>%u</td>"
+                               "<td>%u</td><td>%u</td><td>%u</td></tr>\n",
                                i, ps_record->pid,
-                               dying, old,
-                               ps_record->connections,
-                               ps_record->not_accepting ? "no" : "yes",
+                               dying, old, listening,
+                               thread_idle_buffer[i],
                                thread_busy_buffer[i],
                                thread_graceful_buffer[i],
-                               thread_idle_buffer[i],
+                               ps_record->connections,
+                               ps_record->backlog,
                                ps_record->wait_io,
                                ps_record->write_completion,
                                ps_record->keep_alive,
                                ps_record->shutdown,
+                               ps_record->suspended,
                                ps_record->lingering_close);
                 }
             }
         }
         if (!short_report) {
             ap_rprintf(r, "<tr><th>Sum</th>"
-                       "<td>%d</td><td>%d</td>"
-                       "<td>%d</td><td>&nbsp;</td>"
-                       "<td>%d</td><td>%d</td><td>%d</td>"
-                       "<td>%d</td><td>%d</td><td>%d</td>"
-                       "<td>%d</td><td>%d</td></tr>\n"
-                       "</table>\n",
-                       procs, stopping,
-                       connections,
-                       busy, graceful, idle,
-                       wait_io, write_completion, keep_alive,
-                       shutdown, lingering_close);
+                       "<td>%u</td><td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td></tr>\n"
+                       "</table>\n",
+                       procs, stopping, accepting,
+                       idle, busy, graceful,
+                       connections, backlog,
+                       wait_io, writing,
+                       keep_alive, shutdown,
+                       suspended, closing);
         }
         else {
-            ap_rprintf(r, "Processes: %d\n"
-                       "Stopping: %d\n"
-                       "ConnsTotal: %d\n"
-                       "ConnsAsyncWaitIO: %d\n"
-                       "ConnsAsyncWriting: %d\n"
-                       "ConnsAsyncKeepAlive: %d\n"
-                       "ConnsAsyncShutdown: %d\n"
-                       "ConnsAsyncClosing: %d\n",
-                       procs, stopping,
-                       connections,
-                       wait_io, write_completion, keep_alive,
-                       shutdown, lingering_close);
+            ap_rprintf(r, "Processes: %u\n"
+                       "Stopping: %u\n"
+                       "Accepting: %u\n"
+                       "ThreadsIdle: %u\n"
+                       "ThreadsBusy: %u\n"
+                       "ThreadsGraceful: %u\n"
+                       "ConnsTotal: %u\n"
+                       "ConnsBacklog: %u\n"
+                       "ConnsAsyncWaitIO: %u\n"
+                       "ConnsAsyncWriting: %u\n"
+                       "ConnsAsyncKeepAlive: %u\n"
+                       "ConnsAsyncShutdown: %u\n"
+                       "ConnsAsyncSuspended: %u\n"
+                       "ConnsAsyncClosing: %u\n",
+                       procs, stopping, accepting,
+                       idle, busy, graceful,
+                       connections, backlog,
+                       wait_io, writing,
+                       keep_alive, shutdown,
+                       suspended, closing);
         }
     }
 
     /* send the scoreboard 'table' out */
     if (!short_report)
-        ap_rputs("<pre>", r);
+        ap_rputs("<pre>\n", r);
     else
         ap_rputs("Scoreboard: ", r);
 
@@ -673,11 +727,11 @@ static int status_handler(request_rec *r)
     if (short_report)
         ap_rputs("\n", r);
     else {
-        ap_rputs("</pre>\n"
+        ap_rputs("\n</pre>\n"
                  "<p>Scoreboard Key:<br />\n"
                  "\"<b><code>_</code></b>\" Waiting for Connection, \n"
                  "\"<b><code>S</code></b>\" Starting up, \n"
-                 "\"<b><code>R</code></b>\" Reading Request,<br />\n"
+                 "\"<b><code>R</code></b>\" Waiting I/O,<br />\n"
                  "\"<b><code>W</code></b>\" Sending Reply, \n"
                  "\"<b><code>K</code></b>\" Keepalive (read), \n"
                  "\"<b><code>D</code></b>\" DNS Lookup,<br />\n"
@@ -690,17 +744,21 @@ static int status_handler(request_rec *r)
     if (!ap_extended_status) {
         int j;
         int k = 0;
-        ap_rputs("PID Key:<br />\n"
+        ap_rputs("<hr/><p>PID Key:<br />\n"
                  "<pre>\n", r);
             for (i = 0; i < server_limit; ++i) {
+                ps = ap_get_scoreboard_process(i);
+                if (!proc_buffer[i].pid
+                    || ps->pid != proc_buffer[i].pid
+                    || ps->generation != proc_buffer[i].gen) {
+                    continue;
+                }
                 for (j = 0; j < thread_limit; ++j) {
                     int indx = (i * thread_limit) + j;
 
-                    if (stat_buffer[indx] != '.') {
-                        ap_rprintf(r, "   %" APR_PID_T_FMT
-                                   " in state: %c ", pid_buffer[i],
-                                   stat_buffer[indx]);
-
+                    if (stat_buffer[indx] != status_flags[SERVER_DISABLED]) {
+                        ap_rprintf(r, "  %8" APR_PID_T_FMT " in state: %c ",
+                                   proc_buffer[i].pid, stat_buffer[indx]);
                         if (++k >= 3) {
                             ap_rputs("\n", r);
                             k = 0;
@@ -709,17 +767,16 @@ static int status_handler(request_rec *r)
                     }
                 }
             }
-
-            ap_rputs("\n"
-                     "</pre>\n", r);
+            ap_rvputs(r, k ? "\n" : "", "\n", "</pre><hr/>\n", NULL);
         }
     }
 
     if (ap_extended_status && !short_report) {
         if (no_table_report)
-            ap_rputs("<dl><dt>Server Details</dt></dl>\n\n", r);
+            ap_rputs("<hr/>\n<dl><dt>Server Details</dt></dl>\n", r);
         else
-            ap_rputs("\n\n"
+            ap_rputs("<hr/>\n"
+                     "<table border=\"0\">"
                      "<tr><th>Srv</th><th>PID</th><th>Acc</th>"
                      "<th>M</th>"
 #ifdef HAVE_TIMES
                      "<th>CPU</th>"
 #endif
                      "<th>SS</th><th>Req</th><th>Dur</th>"
                      "<th>Conn</th><th>Child</th><th>Slot</th>"
                      "<th>Client</th><th>Protocol</th><th>VHost</th>"
-                     "<th>Request</th></tr>\n\n", r);
+                     "<th>Request</th></tr>\n", r);
 
         for (i = 0; i < server_limit; ++i) {
+            ps = ap_get_scoreboard_process(i);
+            if (!proc_buffer[i].pid
+                || ps->pid != proc_buffer[i].pid
+                || ps->generation != proc_buffer[i].gen) {
+                continue;
+            }
+
             for (j = 0; j < thread_limit; ++j) {
                 ap_copy_scoreboard_worker(ws_record, i, j);
 
@@ -740,8 +804,6 @@ static int status_handler(request_rec *r)
                     continue;
                 }
 
-                ps_record = ap_get_scoreboard_process(i);
-
                 if (ws_record->start_time == 0L)
                     req_time = 0L;
                 else
@@ -763,8 +825,8 @@ static int status_handler(request_rec *r)
                     worker_generation = ws_record->generation;
                 }
                 else {
-                    worker_pid = ps_record->pid;
-                    worker_generation = ps_record->generation;
+                    worker_pid = proc_buffer[i].pid;
+                    worker_generation = proc_buffer[i].gen;
                 }
 
                 if (no_table_report) {
@@ -842,7 +904,7 @@ static int status_handler(request_rec *r)
                         format_byte_out(r, bytes);
                     ap_rputs(")\n", r);
                     ap_rprintf(r,
-                               " %s {%s}(%s)[%s]<br />\n\n",
+                               " %s {%s}(%s)[%s]<br />\n",
                                ap_escape_html(r->pool,
                                               ws_record->client64),
                                ap_escape_html(r->pool,
@@ -929,7 +991,7 @@ static int status_handler(request_rec *r)
                                (float)bytes / MBYTE);
                 ap_rprintf(r,
                            "<td>%s</td><td>%s</td><td>%s</td><td>%s</td>"
-                           "<td>%s</td></tr>\n\n",
+                           "<td>%s</td></tr>\n",
                            ap_escape_html(r->pool,
                                           ws_record->client64),
                            ap_escape_html(r->pool,
@@ -945,7 +1007,7 @@ static int status_handler(request_rec *r)
 
         if (!no_table_report) {
             ap_rputs("</table>\n \
-<hr /> \
+<hr/> \
 <table>\n \
 <tr><th>Srv</th><td>Child Server number - generation</td></tr>\n \
 <tr><th>PID</th><td>OS process ID</td></tr>\n \
@@ -962,13 +1024,15 @@ static int status_handler(request_rec *r)
 <tr><th>Conn</th><td>Kilobytes transferred this connection</td></tr>\n \
 <tr><th>Child</th><td>Megabytes transferred this child</td></tr>\n \
-<tr><th>Slot</th><td>Total megabytes transferred this slot</td></tr>\n \
-</table>\n", r);
+<tr><th>Slot</th><td>Total megabytes transferred this slot</td></tr>\n \
+</table>\n \
+<hr/>\n", r);
         }
     }                   /* if (ap_extended_status && !short_report) */
     else {
 
         if (!short_report) {
-            ap_rputs("<hr/>To obtain a full report with current status "
+            ap_rputs("<hr/>\n"
+                     "To obtain a full report with current status "
                      "information you need to use the "
                      "<code>ExtendedStatus On</code> directive.\n", r);
         }
@@ -986,7 +1050,7 @@ static int status_handler(request_rec *r)
 
     if (!short_report) {
         ap_rputs(ap_psignature("<hr />\n",r), r);
-        ap_rputs("</body></html>\n", r);
+        ap_rputs("</body>\n</html>\n", r);
     }
 
     return 0;
diff --git a/modules/lua/lua_request.c b/modules/lua/lua_request.c
index f93c3493af4..51cf63f565a 100644
--- a/modules/lua/lua_request.c
+++ b/modules/lua/lua_request.c
@@ -1268,11 +1268,11 @@ static int lua_ap_scoreboard_process(lua_State *L)
         lua_pushnumber(L, ps_record->suspended);
         lua_settable(L, -3);
 
-        lua_pushstring(L, "wait_io");
+        lua_pushstring(L, "wait-io");
         lua_pushnumber(L, ps_record->wait_io);
         lua_settable(L, -3);
 
-        lua_pushstring(L, "write_completion");
+        lua_pushstring(L, "writing");
         lua_pushnumber(L, ps_record->write_completion);
         lua_settable(L, -3);
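
Taken together with the earlier patches in the series, the module-side
pattern for the new suspend/poll/resume API boils down to the sketch
below. This is illustrative only: the my_* names are hypothetical
(mod_proxy_wstunnel above is the real consumer), while the hooks, the
CONN_STATE_* values and the MPM calls are the ones introduced here:

    typedef struct {
        apr_pool_t *pool;          /* subpool for the MPM's polling data */
        apr_array_header_t *pfds;  /* pollfds to wait on */
        apr_interval_time_t timeout;
        int suspended;
    } my_baton_t;

    module AP_MODULE_DECLARE_DATA my_module;

    /* suspend_connection hook: runs once the MPM fully owns the
     * SUSPENDED connection, so starting to poll cannot race another
     * thread still handling the connection.
     */
    static void my_suspend_connection(conn_rec *c, request_rec *r)
    {
        my_baton_t *baton = r ? ap_get_module_config(r->request_config,
                                                     &my_module) : NULL;
        if (!baton || !baton->suspended) {
            return;
        }
        c->cs->state = CONN_STATE_SUSPENDED;
        ap_mpm_poll_suspended(c, baton->pool, baton->pfds, baton->timeout);
    }

    /* resume_connection hook: runs in a worker thread when a pollfd
     * triggered (state still CONN_STATE_SUSPENDED) or when the poll
     * timed out (state forced to CONN_STATE_LINGER by the MPM).
     */
    static void my_resume_connection(conn_rec *c, request_rec *r)
    {
        my_baton_t *baton = r ? ap_get_module_config(r->request_config,
                                                     &my_module) : NULL;
        if (!baton || !baton->suspended) {
            return;
        }
        if (c->cs->state == CONN_STATE_SUSPENDED) {
            /* I/O is ready: pump data here, then either go back to
             * polling (as in my_suspend_connection) or fall through
             * to finish the tunnel.
             */
        }
        baton->suspended = 0;
        c->cs->state = CONN_STATE_LINGER;
        ap_mpm_resume_suspended(c);
    }

Both hooks are registered the way mod_proxy_wstunnel does above, with
ap_hook_suspend_connection() and ap_hook_resume_connection().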