From d865e09d9d62073ff6d723a7a48058dd04784ae4 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Fri, 23 Jun 2023 19:16:05 +0200
Subject: [PATCH 01/22] core: Change filter input/output pending API to
 return OK/AGAIN/DONE.

Rename the ap_filter_input_pending() and ap_filter_output_pending() core
hooks to ap_core_input_pending() and ap_core_output_pending() respectively.
Change them to return AGAIN when some data are pending.

Add ap_check_input_pending() and ap_check_output_pending() to run the hooks
and fix the return value (DECLINED -> OK, c->aborted -> DONE).

Adapt the callers to the new API (DECLINED -> OK, OK -> AGAIN, and DONE on
abort).
---
 include/ap_mmn.h              |  4 ++-
 include/httpd.h               |  2 ++
 include/mpm_common.h          | 18 -------------
 include/util_filter.h         | 48 ++++++++++++++++++++++++++++-------
 modules/http/http_request.c   |  2 +-
 modules/proxy/proxy_util.c    | 10 ++++----
 server/core.c                 |  4 +--
 server/mpm/event/event.c      | 12 ++++-----
 server/mpm/motorz/motorz.c    | 24 ++++++++++--------
 server/mpm/simple/simple_io.c | 23 ++++++++++-------
 server/util_filter.c          | 41 ++++++++++++++++++++++--------
 11 files changed, 117 insertions(+), 71 deletions(-)

diff --git a/include/ap_mmn.h b/include/ap_mmn.h
index ad4b77bd33a..acfa61e22b5 100644
--- a/include/ap_mmn.h
+++ b/include/ap_mmn.h
@@ -731,6 +731,8 @@
  *                          and AP_REQUEST_TRUSTED_CT BNOTE.
  * 20211221.24 (2.5.1-dev) Add ap_proxy_fixup_uds_filename()
  * 20211221.25 (2.5.1-dev) AP_SLASHES and AP_IS_SLASH
+ * 20211221.26 (2.5.1-dev) Add AGAIN, ap_check_input_pending() and
+ *                         ap_check_output_pending()
  */
 
 #define MODULE_MAGIC_COOKIE 0x41503235UL /* "AP25" */
@@ -738,7 +740,7 @@
 #ifndef MODULE_MAGIC_NUMBER_MAJOR
 #define MODULE_MAGIC_NUMBER_MAJOR 20211221
 #endif
-#define MODULE_MAGIC_NUMBER_MINOR 25    /* 0...n */
+#define MODULE_MAGIC_NUMBER_MINOR 26    /* 0...n */
 
 /**
  * Determine if the server's current MODULE_MAGIC_NUMBER is at least a

diff --git a/include/httpd.h b/include/httpd.h
index c02b3b7849b..c3f72fceb7e 100644
--- a/include/httpd.h
+++ b/include/httpd.h
@@ -465,6 +465,8 @@ AP_DECLARE(const char *) ap_get_server_built(void);
  */
 #define SUSPENDED -3 /**< Module will handle the remainder of the request.
                       * The core will never invoke the request again */
+#define AGAIN -4     /**< Module wants to be called again when
+                      * more data is available */
 
 /** Returned by the bottom-most filter if no data was written.
  *  @see ap_pass_brigade(). */

diff --git a/include/mpm_common.h b/include/mpm_common.h
index 334624ee065..34c61e2a6c2 100644
--- a/include/mpm_common.h
+++ b/include/mpm_common.h
@@ -450,24 +450,6 @@ AP_DECLARE_HOOK(apr_status_t, mpm_resume_suspended, (conn_rec*))
  */
 AP_DECLARE_HOOK(const char *,mpm_get_name,(void))
 
-/**
- * Hook called to determine whether we should stay within the write completion
- * phase.
- * @param c The current connection
- * @return OK if write completion should continue, DECLINED if write completion
- *         should end gracefully, or a positive error if we should begin to linger.
- * @ingroup hooks
- */
-AP_DECLARE_HOOK(int, output_pending, (conn_rec *c))
-
-/**
- * Hook called to determine whether any data is pending in the input filters.
- * @param c The current connection
- * @return OK if we can read without blocking, DECLINED if a read would block.
- * @ingroup hooks - */ -AP_DECLARE_HOOK(int, input_pending, (conn_rec *c)) - /** * Notification that connection handling is suspending (disassociating from the * current thread) diff --git a/include/util_filter.h b/include/util_filter.h index a03e81c16ca..6a21c486810 100644 --- a/include/util_filter.h +++ b/include/util_filter.h @@ -645,16 +645,15 @@ AP_DECLARE(void) ap_filter_adopt_brigade(ap_filter_t *f, AP_DECLARE(int) ap_filter_should_yield(ap_filter_t *f); /** - * This function determines whether there is unwritten data in the output + * This function determines whether there is pending data in the output * filters, and if so, attempts to make a single write to each filter - * with unwritten data. + * with pending data. * * @param c The connection. - * @return If no unwritten data remains, this function returns DECLINED. - * If some unwritten data remains, this function returns OK. If any - * attempt to write data failed, this functions returns a positive integer. + * @return OK if no pending data remain, AGAIN if some remain, DONE + * if the connection is aborted, anything else on error. */ -AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c); +AP_DECLARE(int) ap_check_output_pending(conn_rec *c); /** * This function determines whether there is pending data in the input @@ -662,10 +661,41 @@ AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c); * socket but not yet returned to the application. * * @param c The connection. - * @return If no pending data remains, this function returns DECLINED. - * If some pending data remains, this function returns OK. + * @return OK if no pending data remain, AGAIN if some remain, DONE + * if the connection is aborted, anything else on error. */ -AP_DECLARE_NONSTD(int) ap_filter_input_pending(conn_rec *c); +AP_DECLARE(int) ap_check_input_pending(conn_rec *c); + +/** + * Hook called to determine whether we should stay within the write completion + * phase. + * @param c The current connection + * @return OK if we can write without blocking, AGAIN if a write would block, + * DECLINED to let the next hook decide, DONE if the connection is aborted, + * anything else on error. + * @ingroup hooks + */ +AP_DECLARE_HOOK(int, output_pending, (conn_rec *c)) + +/** + * Hook called to determine whether any data is pending in the input filters. + * @param c The current connection + * @return OK if we can read without blocking, AGAIN if a read would block, + * DECLINED to let the next hook decide, DONE if the connection is aborted, + * anything else on error. + * @ingroup hooks + */ +AP_DECLARE_HOOK(int, input_pending, (conn_rec *c)) + +/** + * The core output_pending hook. + */ +AP_DECLARE_NONSTD(int) ap_core_output_pending(conn_rec *c); + +/** + * The core input_pending hook. + */ +AP_DECLARE_NONSTD(int) ap_core_input_pending(conn_rec *c); /** * Flush function for apr_brigade_* calls. 
This calls ap_pass_brigade diff --git a/modules/http/http_request.c b/modules/http/http_request.c index 65c389125a7..77bf63edc5f 100644 --- a/modules/http/http_request.c +++ b/modules/http/http_request.c @@ -484,7 +484,7 @@ AP_DECLARE(void) ap_process_request(request_rec *r) ap_process_async_request(r); - if (ap_run_input_pending(c) != OK) { + if (ap_check_input_pending(c) != AGAIN) { bb = ap_acquire_brigade(c); b = apr_bucket_flush_create(c->bucket_alloc); APR_BRIGADE_INSERT_HEAD(bb, b); diff --git a/modules/proxy/proxy_util.c b/modules/proxy/proxy_util.c index cbc31104c37..88d174220d8 100644 --- a/modules/proxy/proxy_util.c +++ b/modules/proxy/proxy_util.c @@ -5888,12 +5888,12 @@ PROXY_DECLARE(int) ap_proxy_tunnel_run(proxy_tunnel_rec *tunnel) "proxy: %s: %s output ready", scheme, out->name); - rc = ap_filter_output_pending(out->c); - if (rc == OK) { - /* Keep polling out (only) */ + rc = ap_check_output_pending(out->c); + if (rc == AGAIN) { + /* Keep polling (OUT only) */ continue; } - if (rc != DECLINED) { + if (rc != OK) { /* Real failure, bail out */ ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10221) "proxy: %s: %s flushing failed (%i)", @@ -5923,7 +5923,7 @@ PROXY_DECLARE(int) ap_proxy_tunnel_run(proxy_tunnel_rec *tunnel) /* Flush any pending input data now, we don't know when * the next POLLIN will trigger and retaining data might * deadlock the underlying protocol. We don't check for - * pending data first with ap_filter_input_pending() since + * pending data first with ap_check_input_pending() since * the read from proxy_tunnel_transfer() is nonblocking * anyway and returning OK if there's no data. */ diff --git a/server/core.c b/server/core.c index 9f92981ef0d..4d5d569d93b 100644 --- a/server/core.c +++ b/server/core.c @@ -6150,9 +6150,9 @@ static void register_hooks(apr_pool_t *p) ap_hook_get_pollfd_from_conn(core_get_pollfd_from_conn, NULL, NULL, APR_HOOK_REALLY_LAST); - ap_hook_input_pending(ap_filter_input_pending, NULL, NULL, + ap_hook_input_pending(ap_core_input_pending, NULL, NULL, APR_HOOK_MIDDLE); - ap_hook_output_pending(ap_filter_output_pending, NULL, NULL, + ap_hook_output_pending(ap_core_output_pending, NULL, NULL, APR_HOOK_MIDDLE); /* register the core's insert_filter hook and register core-provided diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 403f9a3c939..ab49f34cf44 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -1233,7 +1233,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc } if (cs->pub.state == CONN_STATE_WRITE_COMPLETION) { - int pending = DECLINED; + int pending = OK; /* Flush all pending outputs before going to CONN_STATE_KEEPALIVE or * straight to CONN_STATE_PROCESSING if inputs are pending already. 
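Taken together, the adapted callers follow one pattern. A minimal sketch of
driving write completion with the new wrappers (illustration only, not code
from the tree; the polling step is left abstract):

    #include "httpd.h"        /* OK, AGAIN, DONE, conn_rec */
    #include "util_filter.h"  /* ap_check_{in,out}put_pending() */

    /* Drive write completion with the normalized pending API:
     *   OK    -> nothing left to flush, keepalive/input checks may follow
     *   AGAIN -> data still pending, poll for writability and come back
     *   DONE  -> connection aborted, proceed to (lingering) close
     *   other -> real error, close the connection
     */
    static int drive_write_completion(conn_rec *c)
    {
        int rc = ap_check_output_pending(c);
        if (rc == AGAIN) {
            /* e.g. add c's socket to a pollset with APR_POLLOUT */
            return rc;
        }
        if (rc != OK) {
            return rc; /* DONE (aborted) or an error: close/linger */
        }
        /* Flushed; see whether a new request is already readable */
        return ap_check_input_pending(c);
    }
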
@@ -1243,12 +1243,12 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc if (from_wc_q) { from_wc_q = 0; /* one shot */ - pending = ap_run_output_pending(c); + pending = ap_check_output_pending(c); } else if (ap_filter_should_yield(c->output_filters)) { - pending = OK; + pending = AGAIN; } - if (pending == OK) { + if (pending == AGAIN) { /* Let the event thread poll for write */ cs->queue_timestamp = apr_time_now(); notify_suspend(cs); @@ -1274,11 +1274,11 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc } return; } - if (pending != DECLINED || c->aborted || c->keepalive != AP_CONN_KEEPALIVE) { + if (pending != OK || c->aborted || c->keepalive != AP_CONN_KEEPALIVE) { cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } - if (ap_run_input_pending(c) == OK) { + if (ap_check_input_pending(c) == AGAIN) { goto process_connection; } if (listener_may_exit) { diff --git a/server/mpm/motorz/motorz.c b/server/mpm/motorz/motorz.c index 8feff2965c2..7026d08cd6e 100644 --- a/server/mpm/motorz/motorz.c +++ b/server/mpm/motorz/motorz.c @@ -408,8 +408,8 @@ static apr_status_t motorz_io_process(motorz_conn_t *scon) ap_update_child_status(scon->sbh, SERVER_BUSY_WRITE, NULL); - pending = ap_run_output_pending(c); - if (pending == OK) { + pending = ap_check_output_pending(c); + if (pending == AGAIN) { /* Still in WRITE_COMPLETION_STATE: * Set a write timeout for this connection, and let the * event thread poll for writeability. @@ -432,18 +432,22 @@ static apr_status_t motorz_io_process(motorz_conn_t *scon) } return APR_SUCCESS; } - if (pending != DECLINED - || c->keepalive != AP_CONN_KEEPALIVE - || c->aborted) { - scon->cs.state = CONN_STATE_LINGER; + if (c->keepalive != AP_CONN_KEEPALIVE) { + pending = DONE; } - else if (ap_run_input_pending(c) == OK) { - scon->cs.state = CONN_STATE_PROCESSING; - goto read_request; + else if (pending == OK) { + pending = ap_check_input_pending(c); + if (pending == AGAIN) { + scon->cs.state = CONN_STATE_PROCESSING; + goto read_request; + } } - else { + if (pending == OK) { scon->cs.state = CONN_STATE_KEEPALIVE; } + else { + scon->cs.state = CONN_STATE_LINGER; + } } if (scon->cs.state == CONN_STATE_LINGER) { diff --git a/server/mpm/simple/simple_io.c b/server/mpm/simple/simple_io.c index fb509ed756a..36c5ad87956 100644 --- a/server/mpm/simple/simple_io.c +++ b/server/mpm/simple/simple_io.c @@ -96,8 +96,8 @@ static apr_status_t simple_io_process(simple_conn_t * scon) int pending; ap_update_child_status(c->sbh, SERVER_BUSY_WRITE, NULL); - pending = ap_run_output_pending(c); - if (pending == OK) { + pending = ap_check_output_pending(c); + if (pending == AGAIN) { /* Still in WRITE_COMPLETION_STATE: * Set a write timeout for this connection, and let the * event thread poll for writeability. 
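The motorz.c hunk above and the simple_io.c hunk below make the same
keepalive-versus-linger decision once write completion is done (the AGAIN
case, which re-queues the socket for write polling, returns earlier).
Condensed into a standalone helper under the patch's semantics (a sketch,
not code from the tree):

    /* Next state once write completion returned `pending` (OK, DONE or an
     * error; AGAIN was already handled by polling for write), mirroring
     * the motorz.c and simple_io.c blocks. */
    static conn_state_e next_state_after_wc(conn_rec *c, int pending)
    {
        if (c->keepalive != AP_CONN_KEEPALIVE) {
            pending = DONE;                    /* no keepalive: linger */
        }
        else if (pending == OK) {
            pending = ap_check_input_pending(c);
            if (pending == AGAIN) {
                return CONN_STATE_PROCESSING;  /* next request readable */
            }
        }
        return (pending == OK) ? CONN_STATE_KEEPALIVE : CONN_STATE_LINGER;
    }
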
@@ -126,17 +126,22 @@ static apr_status_t simple_io_process(simple_conn_t * scon) } return APR_SUCCESS; } - if (pending != DECLINED - || c->keepalive != AP_CONN_KEEPALIVE - || c->aborted) { - scon->cs.state = CONN_STATE_LINGER; + if (c->keepalive != AP_CONN_KEEPALIVE) { + pending = DONE; } - else if (ap_run_input_pending(c) == OK) { - scon->cs.state = CONN_STATE_PROCESSING; + else if (pending == OK) { + pending = ap_check_input_pending(c); + if (pending == AGAIN) { + scon->cs.state = CONN_STATE_PROCESSING; + continue; + } } - else { + if (pending == OK) { scon->cs.state = CONN_STATE_KEEPALIVE; } + else { + scon->cs.state = CONN_STATE_LINGER; + } } if (scon->cs.state == CONN_STATE_LINGER) { diff --git a/server/util_filter.c b/server/util_filter.c index 3b1e96fb447..d8dc103d80f 100644 --- a/server/util_filter.c +++ b/server/util_filter.c @@ -393,7 +393,7 @@ static apr_status_t request_filter_cleanup(void *arg) /* A request filter is cleaned up with an EOR bucket, so possibly * while it is handling/passing the EOR, and we want each filter or - * ap_filter_output_pending() to be able to dereference f until they + * ap_check_output_pending() to be able to dereference f until they * return. So request filters are recycled in dead_filters and will only * be moved to spare_filters when recycle_dead_filters() is called, i.e. * in ap_filter_{in,out}put_pending(). Set f->r to NULL still for any use @@ -978,7 +978,7 @@ AP_DECLARE(apr_status_t) ap_filter_setaside_brigade(ap_filter_t *f, e = next) { next = APR_BUCKET_NEXT(e); - /* WC buckets will be added back by ap_filter_output_pending() + /* WC buckets will be added back by ap_check_output_pending() * at the tail. */ if (AP_BUCKET_IS_WC(e)) { @@ -1267,7 +1267,7 @@ AP_DECLARE(int) ap_filter_should_yield(ap_filter_t *f) return 0; } -AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c) +AP_DECLARE_NONSTD(int) ap_core_output_pending(conn_rec *c) { struct ap_filter_conn_ctx *x = c->filter_conn_ctx; struct ap_filter_private *fp, *prev; @@ -1312,7 +1312,7 @@ AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c) } if (ap_filter_should_yield(f)) { - rc = OK; + rc = AGAIN; break; } } @@ -1320,15 +1320,26 @@ AP_DECLARE_NONSTD(int) ap_filter_output_pending(conn_rec *c) ap_release_brigade(c, bb); cleanup: - /* All filters have returned, time to recycle/unleak ap_filter_t-s + /* All filters have returned, time to recycle/unleak dead filters * before leaving (i.e. make them reusable). */ recycle_dead_filters(c); return rc; } +AP_DECLARE(int) ap_check_output_pending(conn_rec *c) +{ + int rc = ap_run_output_pending(c); + if (rc == DECLINED) { + rc = OK; + } + if (rc == OK && c->aborted) { + rc = DONE; + } + return rc; +} -AP_DECLARE_NONSTD(int) ap_filter_input_pending(conn_rec *c) +AP_DECLARE_NONSTD(int) ap_core_input_pending(conn_rec *c) { struct ap_filter_conn_ctx *x = c->filter_conn_ctx; struct ap_filter_private *fp; @@ -1349,21 +1360,31 @@ AP_DECLARE_NONSTD(int) ap_filter_input_pending(conn_rec *c) */ AP_DEBUG_ASSERT(fp->bb); e = APR_BRIGADE_FIRST(fp->bb); - if (e != APR_BRIGADE_SENTINEL(fp->bb) - && e->length != (apr_size_t)(-1)) { - rc = OK; + if (e != APR_BRIGADE_SENTINEL(fp->bb) && e->length != (apr_size_t)-1) { + rc = AGAIN; break; } } cleanup: - /* All filters have returned, time to recycle/unleak ap_filter_t-s + /* All filters have returned, time to recycle/unleak dead filters * before leaving (i.e. make them reusable). 
*/ recycle_dead_filters(c); return rc; } +AP_DECLARE(int) ap_check_input_pending(conn_rec *c) +{ + int rc = ap_run_input_pending(c); + if (rc == DECLINED) { + rc = OK; + } + if (rc == OK && c->aborted) { + rc = DONE; + } + return rc; +} AP_DECLARE_NONSTD(apr_status_t) ap_filter_flush(apr_bucket_brigade *bb, void *ctx) From a92dd49afaeb3cbaec448c17cf8ab0eee18f8e65 Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 11 Jul 2023 20:26:41 +0200 Subject: [PATCH 02/22] mpm_event,listen: Improve/fix children maintenance when num_buckets > 1. --- server/listen.c | 47 +++++----- server/mpm/event/event.c | 184 +++++++++++++++++++-------------------- 2 files changed, 113 insertions(+), 118 deletions(-) diff --git a/server/listen.c b/server/listen.c index 3aed907e21b..f32826e4774 100644 --- a/server/listen.c +++ b/server/listen.c @@ -333,9 +333,7 @@ static apr_status_t alloc_systemd_listener(process_rec * process, si.type = SOCK_STREAM; si.protocol = APR_PROTO_TCP; - rec = apr_palloc(process->pool, sizeof(ap_listen_rec)); - rec->active = 0; - rec->next = 0; + rec = apr_pcalloc(process->pool, sizeof(ap_listen_rec)); rv = apr_os_sock_make(&rec->sd, &si, process->pool); if (rv != APR_SUCCESS) { @@ -462,8 +460,8 @@ static const char *alloc_listener(process_rec *process, const char *addr, apr_pool_t *temp_pool, apr_uint32_t flags) { ap_listen_rec *last; + apr_sockaddr_t *sa, *next_sa; apr_status_t status; - apr_sockaddr_t *sa; /* see if we've got a listener for this address:port, which is an error */ if (find_listeners(&ap_listeners, NULL, addr, port, scope_id, temp_pool)) { @@ -494,22 +492,23 @@ static const char *alloc_listener(process_rec *process, const char *addr, last = last->next; } - while (sa) { + for (; sa; sa = next_sa) { ap_listen_rec *new; + /* Each listener has its own (unlinked) address */ + next_sa = sa->next; + sa->next = NULL; + /* this has to survive restarts */ new = apr_palloc(process->pool, sizeof(ap_listen_rec)); new->active = 0; - new->next = 0; + new->next = NULL; new->bind_addr = sa; new->protocol = apr_pstrdup(process->pool, proto); new->flags = flags; - /* Go to the next sockaddr. 
*/ - sa = sa->next; - status = apr_socket_create(&new->sd, new->bind_addr->family, - SOCK_STREAM, 0, process->pool); + SOCK_STREAM, 0, process->pool); #if APR_HAVE_IPV6 /* What could happen is that we got an IPv6 address, but this system @@ -861,36 +860,36 @@ AP_DECLARE(apr_status_t) ap_duplicate_listeners(apr_pool_t *p, server_rec *s, lr = ap_listeners; while (lr) { ap_listen_rec *duplr; - char *hostname; - apr_port_t port; - apr_sockaddr_t *sa; #ifdef HAVE_SYSTEMD if (use_systemd) { int thesock; apr_os_sock_get(&thesock, lr->sd); if ((stat = alloc_systemd_listener(s->process, thesock, - lr->protocol, &duplr)) != APR_SUCCESS) { + lr->protocol, &duplr))) { return stat; } } else #endif { - duplr = apr_palloc(p, sizeof(ap_listen_rec)); - duplr->slave = NULL; + duplr = apr_pcalloc(p, sizeof(ap_listen_rec)); duplr->protocol = apr_pstrdup(p, lr->protocol); - hostname = apr_pstrdup(p, lr->bind_addr->hostname); - port = lr->bind_addr->port; - stat = apr_sockaddr_info_get(&sa, hostname, APR_UNSPEC, port, 0, p); + duplr->flags = lr->flags; +#if APR_VERSION_AT_LEAST(1,6,0) + stat = apr_sockaddr_info_copy(&duplr->bind_addr, + lr->bind_addr, p); +#else + stat = apr_sockaddr_info_get(&duplr->bind_addr, + lr->bind_addr->hostname, + lr->bind_addr->family, + lr->bind_addr->port, 0, p); +#endif if (stat != APR_SUCCESS) { ap_log_perror(APLOG_MARK, APLOG_CRIT, stat, p, APLOGNO(10397) - "failure looking up %s to duplicate " - "listening socket", hostname); + "failure duplicating address %pI for " + "listening socket", lr->bind_addr); return stat; } - duplr->bind_addr = sa; - duplr->next = NULL; - duplr->flags = lr->flags; stat = apr_socket_create(&duplr->sd, duplr->bind_addr->family, SOCK_STREAM, 0, p); if (stat != APR_SUCCESS) { diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index ab49f34cf44..29a7b2bd345 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -436,13 +436,15 @@ typedef struct event_retained_data { * Workers that still active, i.e. are not shutting down gracefully. */ int active_daemons; + /* * idle_spawn_rate is the number of children that will be spawned on the * next maintenance cycle if there aren't enough idle servers. It is - * maintained per listeners bucket, doubled up to MAX_SPAWN_RATE, and - * reset only when a cycle goes by without the need to spawn. + * doubled up to MAX_SPAWN_RATE, and reset only when a cycle goes by + * without the need to spawn. */ - int *idle_spawn_rate; + int idle_spawn_rate; + int max_spawn_rate, *free_slots; int hold_off_on_exponential_spawning; } event_retained_data; static event_retained_data *retained; @@ -450,7 +452,6 @@ static event_retained_data *retained; #ifndef MAX_SPAWN_RATE #define MAX_SPAWN_RATE 32 #endif -static int max_spawn_rate_per_bucket = MAX_SPAWN_RATE / 1; struct event_srv_cfg_s { struct timeout_queue *io_q, @@ -3144,9 +3145,9 @@ static void child_main(int child_num_arg, int child_bucket) clean_child_exit(resource_shortage ? 
APEXIT_CHILDSICK : 0); } -static int make_child(server_rec * s, int slot, int bucket) +static int make_child(server_rec *s, int slot) { - int pid; + int pid, bucket = slot % retained->mpm->num_buckets; if (slot + 1 > retained->max_daemon_used) { retained->max_daemon_used = slot + 1; @@ -3230,32 +3231,27 @@ static void startup_children(int number_to_start) if (ap_scoreboard_image->parent[i].pid != 0) { continue; } - if (make_child(ap_server_conf, i, i % retained->mpm->num_buckets) < 0) { + if (make_child(ap_server_conf, i) < 0) { break; } --number_to_start; } } -static void perform_idle_server_maintenance(int child_bucket, - int *max_daemon_used) +static void perform_idle_server_maintenance(void) { - int num_buckets = retained->mpm->num_buckets; - int idle_thread_count = 0; - process_score *ps; - int free_length = 0; - int free_slots[MAX_SPAWN_RATE]; + volatile process_score *ps; + const int num_buckets = retained->mpm->num_buckets; int last_non_dead = -1; + int free_length = 0, free_bucket = 0; + int max_daemon_used = 0; + int idle_thread_count = 0; int active_thread_count = 0; int i, j; for (i = 0; i < server_limit; ++i) { - if (num_buckets > 1 && (i % num_buckets) != child_bucket) { - /* We only care about child_bucket in this call */ - continue; - } if (i >= retained->max_daemon_used && - free_length == retained->idle_spawn_rate[child_bucket]) { + free_length == retained->idle_spawn_rate) { /* short cut if all active processes have been examined and * enough empty scoreboard slots have been found */ @@ -3299,12 +3295,16 @@ static void perform_idle_server_maintenance(int child_bucket, } last_non_dead = i; } - else if (free_length < retained->idle_spawn_rate[child_bucket]) { - free_slots[free_length++] = i; + else if (free_length < retained->idle_spawn_rate + && (i % num_buckets) == free_bucket) { + retained->free_slots[free_length++] = i; + if (++free_bucket == num_buckets) { + free_bucket = 0; + } } } - if (*max_daemon_used < last_non_dead + 1) { - *max_daemon_used = last_non_dead + 1; + if (max_daemon_used < last_non_dead + 1) { + max_daemon_used = last_non_dead + 1; } if (retained->sick_child_detected) { @@ -3315,10 +3315,6 @@ static void perform_idle_server_maintenance(int child_bucket, */ retained->sick_child_detected = 0; } - else if (child_bucket < num_buckets - 1) { - /* check for had_healthy_child up to the last child bucket */ - return; - } else { /* looks like a basket case, as no child ever fully initialized; give up. 
*/ @@ -3338,7 +3334,7 @@ static void perform_idle_server_maintenance(int child_bucket, && retained->total_daemons <= retained->max_daemon_used && retained->max_daemon_used <= server_limit); - if (idle_thread_count > max_spare_threads / num_buckets) { + if (idle_thread_count > max_spare_threads) { /* * Child processes that we ask to shut down won't die immediately * but may stay around for a long time when they finish their @@ -3367,17 +3363,19 @@ static void perform_idle_server_maintenance(int child_bucket, retained->total_daemons, retained->max_daemon_used, server_limit, idle_thread_count, max_workers); if (do_kill) { - ap_mpm_podx_signal(retained->buckets[child_bucket].pod, - AP_MPM_PODX_GRACEFUL); + for (i = 0; i < num_buckets; ++i) { + ap_mpm_podx_signal(retained->buckets[i].pod, + AP_MPM_PODX_GRACEFUL); + } } else { /* Wait for dying daemon(s) to exit */ } - retained->idle_spawn_rate[child_bucket] = 1; + retained->idle_spawn_rate = num_buckets; } - else if (idle_thread_count < min_spare_threads / num_buckets) { - if (active_thread_count >= max_workers / num_buckets) { - if (0 == idle_thread_count) { + else if (idle_thread_count < min_spare_threads) { + if (active_thread_count >= max_workers) { + if (0 == idle_thread_count) { if (!retained->maxclients_reported) { ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(00484) "server reached MaxRequestWorkers setting, " @@ -3395,18 +3393,15 @@ static void perform_idle_server_maintenance(int child_bucket, retained->near_maxclients_reported = 1; } } - retained->idle_spawn_rate[child_bucket] = 1; + retained->idle_spawn_rate = num_buckets; } else if (free_length == 0) { /* scoreboard is full, can't fork */ ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(03490) "scoreboard is full, not at MaxRequestWorkers." 
"Increase ServerLimit."); - retained->idle_spawn_rate[child_bucket] = 1; + retained->idle_spawn_rate = num_buckets; } else { - if (free_length > retained->idle_spawn_rate[child_bucket]) { - free_length = retained->idle_spawn_rate[child_bucket]; - } if (free_length + retained->active_daemons > active_daemons_limit) { if (retained->active_daemons < active_daemons_limit) { free_length = active_daemons_limit - retained->active_daemons; @@ -3418,14 +3413,13 @@ static void perform_idle_server_maintenance(int child_bucket, "total %d/%d/%d, rate %d", free_length, retained->active_daemons, active_daemons_limit, retained->total_daemons, retained->max_daemon_used, - server_limit, retained->idle_spawn_rate[child_bucket]); + server_limit, retained->idle_spawn_rate); /* reset the spawning rate and prevent its growth below */ - retained->idle_spawn_rate[child_bucket] = 1; - ++retained->hold_off_on_exponential_spawning; + retained->idle_spawn_rate = num_buckets; free_length = 0; } } - if (retained->idle_spawn_rate[child_bucket] >= 8) { + if (retained->idle_spawn_rate >= retained->max_spawn_rate / 4) { ap_log_error(APLOG_MARK, APLOG_INFO, 0, ap_server_conf, APLOGNO(00486) "server seems busy, (you may need " "to increase StartServers, ThreadsPerChild " @@ -3436,13 +3430,14 @@ static void perform_idle_server_maintenance(int child_bucket, idle_thread_count, retained->active_daemons, retained->total_daemons); } + free_length = (free_length / num_buckets) * num_buckets; for (i = 0; i < free_length; ++i) { - int slot = free_slots[i]; - if (make_child(ap_server_conf, slot, child_bucket) < 0) { + int slot = retained->free_slots[i]; + if (make_child(ap_server_conf, slot) < 0) { continue; } - if (*max_daemon_used < slot + 1) { - *max_daemon_used = slot + 1; + if (max_daemon_used < slot + 1) { + max_daemon_used = slot + 1; } } /* the next time around we want to spawn twice as many if this @@ -3451,31 +3446,41 @@ static void perform_idle_server_maintenance(int child_bucket, if (retained->hold_off_on_exponential_spawning) { --retained->hold_off_on_exponential_spawning; } - else if (retained->idle_spawn_rate[child_bucket] - < max_spawn_rate_per_bucket) { - int new_rate = retained->idle_spawn_rate[child_bucket] * 2; - if (new_rate > max_spawn_rate_per_bucket) { - new_rate = max_spawn_rate_per_bucket; + else if (free_length && retained->idle_spawn_rate < retained->max_spawn_rate) { + int new_rate = retained->idle_spawn_rate * 2; + new_rate = ((new_rate + num_buckets - 1) / num_buckets) * num_buckets; + if (new_rate > retained->max_spawn_rate) { + new_rate = retained->max_spawn_rate; } - retained->idle_spawn_rate[child_bucket] = new_rate; + retained->idle_spawn_rate = new_rate; } } } else { - retained->idle_spawn_rate[child_bucket] = 1; + retained->idle_spawn_rate = num_buckets; + } + + retained->max_daemon_used = max_daemon_used; + if (APLOGdebug(ap_server_conf)) { + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, + "score: idlers:%d, " + "threads active:%d/%d max:%d, " + "daemons active:%d/%d max:%d used:%d/%d/%d", + idle_thread_count, + active_thread_count, retained->active_daemons * threads_per_child, + max_workers, retained->active_daemons, retained->total_daemons, + active_daemons_limit, max_daemon_used, retained->max_daemon_used, + server_limit); } } static void server_main_loop(int remaining_children_to_start) { - int num_buckets = retained->mpm->num_buckets; - int max_daemon_used = 0; int successive_kills = 0; int child_slot; apr_exit_why_e exitwhy; int status, processed_status; apr_proc_t pid; - int i; 
while (!retained->mpm->restart_pending && !retained->mpm->shutdown_pending) { ap_wait_or_timeout(&exitwhy, &status, &pid, pconf, ap_server_conf); @@ -3520,14 +3525,13 @@ static void server_main_loop(int remaining_children_to_start) if (processed_status == APEXIT_CHILDSICK) { /* resource shortage, minimize the fork rate */ - retained->idle_spawn_rate[child_slot % num_buckets] = 1; + retained->idle_spawn_rate = retained->mpm->num_buckets; } else if (remaining_children_to_start) { /* we're still doing a 1-for-1 replacement of dead * children with new children */ - make_child(ap_server_conf, child_slot, - child_slot % num_buckets); + make_child(ap_server_conf, child_slot); --remaining_children_to_start; } } @@ -3586,11 +3590,7 @@ static void server_main_loop(int remaining_children_to_start) continue; } - max_daemon_used = 0; - for (i = 0; i < num_buckets; i++) { - perform_idle_server_maintenance(i, &max_daemon_used); - } - retained->max_daemon_used = max_daemon_used; + perform_idle_server_maintenance(); } } @@ -3680,35 +3680,36 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) retained->buckets[i].listeners = listen_buckets[i]; } - if (retained->mpm->max_buckets < num_buckets) { - int new_max, *new_ptr; - new_max = retained->mpm->max_buckets * 2; - if (new_max < num_buckets) { - new_max = num_buckets; + /* If num_buckets changed, adjust max_spawn_rate and the free_slots buffer */ + if (retained->mpm->num_buckets != num_buckets) { + if (retained->mpm->max_buckets < num_buckets) { + int new_max, new_slots; + new_max = retained->mpm->max_buckets * 2; + if (new_max < num_buckets) { + new_max = num_buckets; + } + else { + new_max = ((new_max + num_buckets - 1) / num_buckets) * num_buckets; + } + new_slots = ((MAX_SPAWN_RATE + new_max - 1) / new_max) * new_max; + retained->free_slots = apr_palloc(ap_pglobal, new_slots * sizeof(int)); + retained->mpm->max_buckets = new_max; } - new_ptr = (int *)apr_palloc(ap_pglobal, new_max * sizeof(int)); - if (retained->mpm->num_buckets) /* idle_spawn_rate NULL at startup */ - memcpy(new_ptr, retained->idle_spawn_rate, - retained->mpm->num_buckets * sizeof(int)); - retained->idle_spawn_rate = new_ptr; - retained->mpm->max_buckets = new_max; - } - if (retained->mpm->num_buckets < num_buckets) { - int rate_max = 1; - /* If new buckets are added, set their idle spawn rate to - * the highest so far, so that they get filled as quickly - * as the existing ones. + /* We always spawn/kill children in a multiple of num_buckets (as needed), + * so align (round up) max_spawn_rate and idle_spawn_rate to num_buckets. */ - for (i = 0; i < retained->mpm->num_buckets; i++) { - if (rate_max < retained->idle_spawn_rate[i]) { - rate_max = retained->idle_spawn_rate[i]; - } + retained->max_spawn_rate = (((MAX_SPAWN_RATE + num_buckets - 1) + / num_buckets) * num_buckets); + retained->idle_spawn_rate = (((retained->idle_spawn_rate + num_buckets - 1) + / num_buckets) * num_buckets); + if (retained->idle_spawn_rate < num_buckets) { + retained->idle_spawn_rate = num_buckets; } - for (/* up to date i */; i < num_buckets; i++) { - retained->idle_spawn_rate[i] = rate_max; + else if (retained->idle_spawn_rate > retained->max_spawn_rate) { + retained->idle_spawn_rate = retained->max_spawn_rate; } + retained->mpm->num_buckets = num_buckets; } - retained->mpm->num_buckets = num_buckets; /* Don't thrash since num_buckets depends on the * system and the number of online CPU cores... 
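The alignment idiom repeated above keeps every spawn/kill decision a whole
multiple of num_buckets, so each listeners bucket gains or loses the same
number of children. Isolated for clarity (a sketch, not a helper that exists
in the tree):

    /* Round n up to the next multiple of num_buckets, the idiom used above
     * for max_spawn_rate, idle_spawn_rate and the spawned free_length. */
    static int round_up_to_buckets(int n, int num_buckets)
    {
        return ((n + num_buckets - 1) / num_buckets) * num_buckets;
    }
    /* e.g. MAX_SPAWN_RATE 32 with num_buckets 3 gives 33, i.e. up to 11
     * children per bucket; a slot's bucket is then slot % num_buckets. */
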
@@ -3728,11 +3729,6 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) if (max_spare_threads < min_spare_threads + (threads_per_child + 1) * num_buckets) max_spare_threads = min_spare_threads + (threads_per_child + 1) * num_buckets; - max_spawn_rate_per_bucket = (MAX_SPAWN_RATE + num_buckets - 1) / num_buckets; - if (max_spawn_rate_per_bucket < 1) { - max_spawn_rate_per_bucket = 1; - } - /* If we're doing a graceful_restart then we're going to see a lot * of children exiting immediately when we get into the main loop * below (because we just sent them AP_SIG_GRACEFUL). This happens pretty From b85d7387f59eff51261744997abf81b3cc7c9d55 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 3 Jun 2024 15:35:47 +0200 Subject: [PATCH 03/22] mpm_event: Add helpers, simplify code and improve logging before functional changes. --- modules/http/http_request.c | 3 +- server/mpm/event/event.c | 1338 +++++++++++++++++++-------------- server/mpm/motorz/motorz.c | 14 +- server/mpm/simple/simple_io.c | 10 +- 4 files changed, 787 insertions(+), 578 deletions(-) diff --git a/modules/http/http_request.c b/modules/http/http_request.c index 77bf63edc5f..c8f157eca80 100644 --- a/modules/http/http_request.c +++ b/modules/http/http_request.c @@ -431,7 +431,8 @@ void ap_process_async_request(request_rec *r) const apr_array_header_t *t_h = apr_table_elts(r->headers_in); const apr_table_entry_t *t_elt = (apr_table_entry_t *)t_h->elts; ap_log_rerror(APLOG_MARK, APLOG_TRACE4, 0, r, - "Headers received from client:"); + "Header received from client:"); + ap_log_rerror(APLOG_MARK, APLOG_TRACE4, 0, r, " %s", r->the_request); for (i = 0; i < t_h->nelts; i++, t_elt++) { ap_log_rerror(APLOG_MARK, APLOG_TRACE4, 0, r, " %s: %s", ap_escape_logitem(r->pool, t_elt->key), diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 29a7b2bd345..64ff1e30ead 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -149,7 +149,7 @@ #define MPM_CHILD_PID(i) (ap_scoreboard_image->parent[i].pid) #if !APR_VERSION_AT_LEAST(1,4,0) -#define apr_time_from_msec(x) (x * 1000) +#define apr_time_from_msec(x) ((x) * 1000) #endif #define CONN_STATE_IS_LINGERING_CLOSE(s) ((s) >= CONN_STATE_LINGER && \ @@ -159,6 +159,21 @@ #endif #define SECONDS_TO_LINGER 2 +/* Don't wait more time in poll() if APR_POLLSET_WAKEABLE is not implemented */ +#define NON_WAKEABLE_TIMEOUT apr_time_from_msec(100) + +/* Prevent extra poll/wakeup calls for timeouts close in the future (queues + * have the granularity of a second anyway). + * XXX: Wouldn't 0.5s (instead of 0.1s) be "enough"? + */ +#define QUEUES_FUDGE_TIMEOUT apr_time_from_msec(100) + +/* Same goal as for QUEUES_FUDGE_TIMEOUT, but applied to timers. + * XXX: Since their timeouts are custom (user defined), we can't be too + * approximative here (using 5ms). 
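+ * (expiries within this margin are batched to spare poll() wakeups)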
+ */ +#define TIMERS_FUDGE_TIMEOUT apr_time_from_msec(5) + /* * Actual definitions of config globals */ @@ -178,26 +193,27 @@ static int active_daemons_limit = 0; /* MaxRequestWorkers / ThreadsPerChi static int max_workers = 0; /* MaxRequestWorkers */ static int server_limit = 0; /* ServerLimit */ static int thread_limit = 0; /* ThreadLimit */ -static int had_healthy_child = 0; +static int conns_this_child = 0; /* MaxConnectionsPerChild, only accessed + in listener thread */ static volatile int dying = 0; static volatile int workers_may_exit = 0; static volatile int start_thread_may_exit = 0; static volatile int listener_may_exit = 0; -static int listener_is_wakeable = 0; /* Pollset supports APR_POLLSET_WAKEABLE */ -static int num_listensocks = 0; -static apr_int32_t conns_this_child; /* MaxConnectionsPerChild, only access - in listener thread */ static apr_uint32_t connection_count = 0; /* Number of open connections */ static apr_uint32_t lingering_count = 0; /* Number of connections in lingering close */ static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ static apr_uint32_t clogged_count = 0; /* Number of threads processing ssl conns */ static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown early during graceful termination */ +static int had_healthy_child = 0; static int resource_shortage = 0; + static fd_queue_t *worker_queue; static fd_queue_info_t *worker_queue_info; -static apr_thread_mutex_t *timeout_mutex; +static int num_listensocks = 0; +static int listener_is_wakeable = 0; /* Pollset supports APR_POLLSET_WAKEABLE */ +static apr_pollfd_t *listener_pollfd; module AP_MODULE_DECLARE_DATA mpm_event_module; @@ -205,7 +221,8 @@ module AP_MODULE_DECLARE_DATA mpm_event_module; struct event_srv_cfg_s; typedef struct event_srv_cfg_s event_srv_cfg; -static apr_pollfd_t *listener_pollfd; +struct timeout_queue; +static apr_thread_mutex_t *timeout_mutex; /* * The pollset for sockets that are in any of the timeout queues. Currently @@ -219,18 +236,13 @@ static apr_pollfd_t *listener_pollfd; static apr_pollset_t *event_pollset; typedef struct event_conn_state_t event_conn_state_t; - -/* - * The chain of connections to be shutdown by a worker thread (deferred), - * linked list updated atomically. - */ -static event_conn_state_t *volatile defer_linger_chain; - struct event_conn_state_t { /** APR_RING of expiration timeouts */ APR_RING_ENTRY(event_conn_state_t) timeout_list; - /** the time when the entry was queued */ - apr_time_t queue_timestamp; + /** public parts of the connection state */ + conn_state_t pub; + /** memory pool allocated on and to allocate from (ptrans) */ + apr_pool_t *p; /** connection record this struct refers to */ conn_rec *c; /** request record (if any) this struct refers to */ @@ -239,37 +251,101 @@ struct event_conn_state_t { event_srv_cfg *sc; /** scoreboard handle for the conn_rec */ ap_sb_handle_t *sbh; - /** is the current conn_rec suspended? 
(disassociated with - * a particular MPM thread; for suspend_/resume_connection - * hooks) - */ - int suspended; - /** memory pool to allocate from */ - apr_pool_t *p; /** bucket allocator */ apr_bucket_alloc_t *bucket_alloc; + + /* + * when queued to the listener + */ /** poll file descriptor information */ apr_pollfd_t pfd; - /** public parts of the connection state */ - conn_state_t pub; + /** the time when the entry was queued */ + apr_time_t queue_timestamp; + /** the timeout queue for this entry */ + struct timeout_queue *q; + + /* + * when queued to workers + */ /** chaining in defer_linger_chain */ struct event_conn_state_t *chain; - unsigned int + + /* + * bools as bits + */ + unsigned int + /** Is the current conn_rec suspended? (disassociated with + * a particular MPM thread; for suspend_/resume_connection + * hooks) + */ + suspended :1, /** Is lingering close from defer_lingering_close()? */ deferred_linger :1, /** Has ap_start_lingering_close() been called? */ linger_started :1; }; -APR_RING_HEAD(timeout_head_t, event_conn_state_t); +static APR_INLINE apr_socket_t *cs_sd(event_conn_state_t *cs) +{ + ap_assert(cs != NULL); + return cs->pfd.desc.s; +} +static APR_INLINE int cs_fd(event_conn_state_t *cs) +{ + apr_os_sock_t fd = -1; + apr_os_sock_get(&fd, cs_sd(cs)); + return fd; +} +static APR_INLINE apr_sockaddr_t *cs_raddr(event_conn_state_t *cs) +{ + apr_sockaddr_t *addr = NULL; + apr_socket_addr_get(&addr, APR_REMOTE, cs_sd(cs)); + return addr; +} +static APR_INLINE const char *cs_state_str(event_conn_state_t *cs) +{ + switch (cs->pub.state) { + case CONN_STATE_PROCESSING: + return "STATE_PROCESSING"; + case CONN_STATE_HANDLER: + return "STATE_HANDLER"; + case CONN_STATE_ASYNC_WAITIO: + return "STATE_ASYNC_WAITIO"; + case CONN_STATE_WRITE_COMPLETION: + return "STATE_WRITE_COMPLETION"; + case CONN_STATE_KEEPALIVE: + return "STATE_KEEPALIVE"; + case CONN_STATE_LINGER: + case CONN_STATE_LINGER_NORMAL: + case CONN_STATE_LINGER_SHORT: + return "STATE_LINGER"; + case CONN_STATE_SUSPENDED: + return "STATE_SUSPENDED"; + default: + return "STATE_UNKNOWN"; + } +} +#define CS_FMT "pp:%s:%i" +#define CS_ARG(cs) (cs), cs_state_str(cs), cs_fd(cs) +#define CS_FMT_TO CS_FMT " to [%pI]" +#define CS_ARG_TO(cs) CS_ARG(cs), cs_raddr(cs) +/* + * The chain of connections to be shutdown by a worker thread (deferred), + * linked list updated atomically. + */ +static event_conn_state_t *volatile defer_linger_chain; + +APR_RING_HEAD(timeout_head_t, event_conn_state_t); struct timeout_queue { struct timeout_head_t head; apr_interval_time_t timeout; apr_uint32_t count; /* for this queue */ apr_uint32_t *total; /* for all chained/related queues */ + const char *name; /* for logging */ struct timeout_queue *next; /* chaining */ }; + /* * Several timeout queues that use different timeouts, so that we always can * simply append to the end. 
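Because every entry of a queue shares the queue's single timeout and
TO_QUEUE_APPEND() (below) always inserts at the tail, each queue stays
ordered by expiry, so a sweep can stop at the first entry that has not
expired yet. A minimal sketch of such a sweep (hypothetical helper, not part
of the patch; the listener's real maintenance loop also handles wakeups and
batches the shutdowns), assuming timeout_mutex is held:

    static void sweep_queue(struct timeout_queue *q, apr_time_t now)
    {
        event_conn_state_t *cs = APR_RING_FIRST(&q->head);
        while (cs != APR_RING_SENTINEL(&q->head, event_conn_state_t,
                                       timeout_list)
               && cs->queue_timestamp + q->timeout <= now) {
            event_conn_state_t *next = APR_RING_NEXT(cs, timeout_list);
            TO_QUEUE_REMOVE(q, cs);  /* unlink from this queue */
            /* ... then shut the expired connection down (lingering close) */
            cs = next;
        }
    }
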
@@ -279,39 +355,38 @@ struct timeout_queue {
  * linger_q uses MAX_SECS_TO_LINGER
  * short_linger_q uses SECONDS_TO_LINGER
  */
-static struct timeout_queue *waitio_q,
-                            *write_completion_q,
-                            *keepalive_q,
-                            *linger_q,
-                            *short_linger_q;
-static volatile apr_time_t queues_next_expiry;
+static struct timeout_queue *waitio_q,           /* wait for I/O to happen */
+                            *write_completion_q, /* completion or user async poll */
+                            *keepalive_q,        /* in between requests */
+                            *linger_q,           /* lingering (read) before close */
+                            *short_linger_q;     /* lingering (read) before close (short timeout) */
 
-/* Prevent extra poll/wakeup calls for timeouts close in the future (queues
- * have the granularity of a second anyway).
- * XXX: Wouldn't 0.5s (instead of 0.1s) be "enough"?
- */
-#define TIMEOUT_FUDGE_FACTOR apr_time_from_msec(100)
+static volatile apr_time_t queues_next_expiry; /* next expiry time across all queues */
 
 /*
  * Macros for accessing struct timeout_queue.
  * For TO_QUEUE_APPEND and TO_QUEUE_REMOVE, timeout_mutex must be held.
  */
-static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
+static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *cs)
 {
     apr_time_t elem_expiry;
     apr_time_t next_expiry;
 
-    APR_RING_INSERT_TAIL(&q->head, el, event_conn_state_t, timeout_list);
+    ap_assert(q && !cs->q);
+
+    cs->q = q;
+    cs->queue_timestamp = apr_time_now();
+    APR_RING_INSERT_TAIL(&q->head, cs, event_conn_state_t, timeout_list);
     ++*q->total;
     ++q->count;
 
     /* Cheaply update the global queues_next_expiry with the one of the
      * first entry of this queue (oldest) if it expires before.
      */
-    el = APR_RING_FIRST(&q->head);
-    elem_expiry = el->queue_timestamp + q->timeout;
+    cs = APR_RING_FIRST(&q->head);
+    elem_expiry = cs->queue_timestamp + q->timeout;
     next_expiry = queues_next_expiry;
-    if (!next_expiry || next_expiry > elem_expiry + TIMEOUT_FUDGE_FACTOR) {
+    if (!next_expiry || next_expiry > elem_expiry + QUEUES_FUDGE_TIMEOUT) {
         queues_next_expiry = elem_expiry;
         /* Unblock the poll()ing listener for it to update its timeout. */
         if (listener_is_wakeable) {
@@ -320,29 +395,51 @@ static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
     }
 }
 
-static void TO_QUEUE_REMOVE(struct timeout_queue *q, event_conn_state_t *el)
+static void TO_QUEUE_REMOVE(struct timeout_queue *q, event_conn_state_t *cs)
 {
-    APR_RING_REMOVE(el, timeout_list);
-    APR_RING_ELEM_INIT(el, timeout_list);
+    ap_assert(q && cs->q == q);
+    cs->q = NULL;
+
+    APR_RING_REMOVE(cs, timeout_list);
+    APR_RING_ELEM_INIT(cs, timeout_list);
     --*q->total;
     --q->count;
 }
 
-static struct timeout_queue *TO_QUEUE_MAKE(apr_pool_t *p, apr_time_t t,
+static struct timeout_queue *TO_QUEUE_MAKE(apr_pool_t *p,
+                                           const char *name,
+                                           apr_interval_time_t t,
                                            struct timeout_queue *ref)
 {
     struct timeout_queue *q;
-
+
     q = apr_pcalloc(p, sizeof *q);
     APR_RING_INIT(&q->head, event_conn_state_t, timeout_list);
     q->total = (ref) ?
ref->total : apr_pcalloc(p, sizeof *q->total); q->timeout = t; + q->name = name; return q; } -#define TO_QUEUE_ELEM_INIT(el) \ - APR_RING_ELEM_INIT((el), timeout_list) +static struct timeout_queue *TO_QUEUE_CHAIN(apr_pool_t *p, + const char *name, + apr_interval_time_t t, + struct timeout_queue **ref, + apr_hash_t *ht, apr_pool_t *hp) +{ + struct timeout_queue *q = apr_hash_get(ht, &t, sizeof t); + + if (!q) { + q = TO_QUEUE_MAKE(p, name, t, *ref); + q->next = *ref; + *ref = q; + + apr_hash_set(ht, apr_pmemdup(hp, &t, sizeof t), sizeof t, q); + } + + return q; +} #if HAVE_SERF typedef struct { @@ -454,6 +551,7 @@ static event_retained_data *retained; #endif struct event_srv_cfg_s { + /* Per server timeout queues */ struct timeout_queue *io_q, *wc_q, *ka_q; @@ -512,37 +610,59 @@ static void disable_listensocks(void) if (apr_atomic_cas32(&listensocks_disabled, 1, 0) != 0) { return; } - if (event_pollset) { - for (i = 0; i < num_listensocks; i++) { - apr_pollset_remove(event_pollset, &listener_pollfd[i]); - } - } + + ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) + "Suspend listening sockets: idlers:%i conns:%u " + "waitio:%u write:%u keepalive:%u linger:%u/%u " + "suspended:%u clogged:%u", + ap_queue_info_num_idlers(worker_queue_info), + apr_atomic_read32(&connection_count), + apr_atomic_read32(waitio_q->total), + apr_atomic_read32(write_completion_q->total), + apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(linger_q->total), + apr_atomic_read32(short_linger_q->total), + apr_atomic_read32(&suspended_count), + apr_atomic_read32(&clogged_count)); + ap_scoreboard_image->parent[ap_child_slot].not_accepting = 1; + + for (i = 0; i < num_listensocks; i++) { + apr_pollset_remove(event_pollset, &listener_pollfd[i]); + } } static void enable_listensocks(void) { int i; if (listener_may_exit - || apr_atomic_cas32(&listensocks_disabled, 0, 1) != 1) { + || apr_atomic_cas32(&listensocks_disabled, 0, 1) != 1) { return; } - ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(00457) - "Accepting new connections again: " - "%u active conns (%u lingering/%u clogged/%u suspended), " - "%u idle workers", + + ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) + "Resume listening sockets: idlers:%i conns:%u " + "waitio:%u write:%u keepalive:%u linger:%u/%u " + "suspended:%u clogged:%u", + ap_queue_info_num_idlers(worker_queue_info), apr_atomic_read32(&connection_count), - apr_atomic_read32(&lingering_count), - apr_atomic_read32(&clogged_count), + apr_atomic_read32(waitio_q->total), + apr_atomic_read32(write_completion_q->total), + apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(linger_q->total), + apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&suspended_count), - ap_queue_info_num_idlers(worker_queue_info)); - for (i = 0; i < num_listensocks; i++) - apr_pollset_add(event_pollset, &listener_pollfd[i]); + apr_atomic_read32(&clogged_count)); + /* * XXX: This is not yet optimal. If many workers suddenly become available, * XXX: the parent may kill some processes off too soon. 
*/ ap_scoreboard_image->parent[ap_child_slot].not_accepting = 0; + + for (i = 0; i < num_listensocks; i++) { + apr_pollset_add(event_pollset, &listener_pollfd[i]); + } } static APR_INLINE apr_uint32_t listeners_disabled(void) @@ -575,21 +695,23 @@ static APR_INLINE int should_enable_listensocks(void) return !dying && listeners_disabled() && !connections_above_limit(NULL); } -static void close_socket_nonblocking_(apr_socket_t *csd, - const char *from, int line) +static void close_socket_at(apr_socket_t *csd, + const char *at, int line) { - apr_status_t rv; apr_os_sock_t fd = -1; + apr_status_t rv = apr_os_sock_get(&fd, csd); /* close_worker_sockets() may have closed it already */ - rv = apr_os_sock_get(&fd, csd); - ap_log_error(APLOG_MARK, APLOG_TRACE8, 0, ap_server_conf, - "closing socket %i/%pp from %s:%i", (int)fd, csd, from, line); if (rv == APR_SUCCESS && fd == -1) { + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "dead socket %pp at %s:%i", csd, at, line); return; } - apr_socket_timeout_set(csd, 0); + ap_log_error(APLOG_MARK, APLOG_TRACE7, rv, ap_server_conf, + "closing socket %pp:%i at %s:%i", csd, (int)fd, at, line); + + apr_socket_opt_set(csd, APR_SO_NONBLOCK, 1); rv = apr_socket_close(csd); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(00468) @@ -597,8 +719,8 @@ static void close_socket_nonblocking_(apr_socket_t *csd, AP_DEBUG_ASSERT(0); } } -#define close_socket_nonblocking(csd) \ - close_socket_nonblocking_(csd, __FUNCTION__, __LINE__) +#define close_socket(csd) \ + close_socket_at(csd, __FUNCTION__, __LINE__) static void close_worker_sockets(void) { @@ -607,15 +729,16 @@ static void close_worker_sockets(void) apr_socket_t *csd = worker_sockets[i]; if (csd) { worker_sockets[i] = NULL; - close_socket_nonblocking(csd); + close_socket(csd); } } } -static void wakeup_listener(void) +static void shutdown_listener(void) { ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - "wake up listener%s", listener_may_exit ? " again" : ""); + "shutting down listener%s", + listener_may_exit ? 
" again" : ""); listener_may_exit = 1; disable_listensocks(); @@ -667,7 +790,7 @@ static void signal_threads(int mode) /* in case we weren't called from the listener thread, wake up the * listener thread */ - wakeup_listener(); + shutdown_listener(); /* for ungraceful termination, let the workers exit now; * for graceful termination, the listener thread will notify the @@ -841,8 +964,10 @@ static apr_status_t decrement_connection_count(void *cs_) { int is_last_connection; event_conn_state_t *cs = cs_; - ap_log_cerror(APLOG_MARK, APLOG_TRACE8, 0, cs->c, - "cleanup connection from state %i", (int)cs->pub.state); + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, + "connection %" CS_FMT_TO " cleaned up", + CS_ARG_TO(cs)); + switch (cs->pub.state) { case CONN_STATE_LINGER: case CONN_STATE_LINGER_NORMAL: @@ -861,8 +986,8 @@ static apr_status_t decrement_connection_count(void *cs_) */ is_last_connection = !apr_atomic_dec32(&connection_count); if (listener_is_wakeable - && ((is_last_connection && listener_may_exit) - || should_enable_listensocks())) { + && ((is_last_connection && listener_may_exit) + || should_enable_listensocks())) { apr_pollset_wakeup(event_pollset); } if (dying) { @@ -895,7 +1020,7 @@ static void notify_resume(event_conn_state_t *cs, int cleanup) static int defer_lingering_close(event_conn_state_t *cs) { ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "deferring close from state %i", (int)cs->pub.state); + "deferring close for connection %" CS_FMT, CS_ARG(cs)); /* The connection is not shutdown() yet strictly speaking, but it's not * in any queue nor handled by a worker either (will be very soon), so @@ -922,14 +1047,28 @@ static int defer_lingering_close(event_conn_state_t *cs) * Pre-condition: nonblocking, can be called from anywhere provided cs is not * in any timeout queue or in the pollset. */ -static void close_connection(event_conn_state_t *cs) +static void close_connection_at(event_conn_state_t *cs, + const char *at, int line) { - ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "closing connection from state %i", (int)cs->pub.state); + if (cs->c) { + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "closing connection %" CS_FMT " at %s:%i", + CS_ARG(cs), at, line); + } + else { + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, + "closing connection %" CS_FMT_TO " at %s:%i", + CS_ARG_TO(cs), at, line); + } - close_socket_nonblocking(cs->pfd.desc.s); + close_socket_at(cs_sd(cs), at, line); ap_queue_info_push_pool(worker_queue_info, cs->p); } +#define close_connection(cs) \ + close_connection_at((cs), __FUNCTION__, __LINE__) + +/* forward declare */ +static void set_conn_state_sense(event_conn_state_t *cs, int sense); /* Shutdown the connection in case of timeout, error or resources shortage. * This starts short lingering close if not already there, or directly closes @@ -1015,11 +1154,145 @@ static int event_post_read_request(request_rec *r) return OK; } +static int pollset_add_at(event_conn_state_t *cs, int sense, + struct timeout_queue *q, + const char *at, int line) +{ + apr_status_t rv; + + ap_log_cerror(APLOG_MARK, APLOG_TRACE7, 0, cs->c, + "pollset: add %s=%" APR_TIME_T_FMT " events=%x" + " for connection %" CS_FMT " at %s:%i", + (q) ? "q" : "t", + (q) ? 
q->timeout : -1, + (int)cs->pfd.reqevents, + CS_ARG(cs), at, line); + + ap_assert(cs->q == NULL && q != NULL); + + set_conn_state_sense(cs, sense); + + if (q) { + apr_thread_mutex_lock(timeout_mutex); + TO_QUEUE_APPEND(q, cs); + } + rv = apr_pollset_add(event_pollset, &cs->pfd); + if (rv != APR_SUCCESS) { + if (q) { + TO_QUEUE_REMOVE(q, cs); + apr_thread_mutex_unlock(timeout_mutex); + } + + /* close_worker_sockets() may have closed it already */ + if (workers_may_exit) { + AP_DEBUG_ASSERT(APR_STATUS_IS_EBADF(rv)); + } + else { + ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, cs->c, APLOGNO(03093) + "pollset add failed for connection %" CS_FMT " at %s:%i", + CS_ARG(cs), at, line); + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + } + return 0; + } + if (q) { + apr_thread_mutex_unlock(timeout_mutex); + } + return 1; +} +#define pollset_add(cs, sense, q) \ + pollset_add_at((cs), (sense), (q), __FUNCTION__, __LINE__) + +static int pollset_del_at(event_conn_state_t *cs, int locked, + const char *at, int line) +{ + apr_status_t rv; + + ap_log_cerror(APLOG_MARK, APLOG_TRACE7, 0, cs->c, + "pollset: del %s=%" APR_TIME_T_FMT " events=%x" + " for connection %" CS_FMT " at %s:%i", + (cs->q) ? "q" : "t", + (cs->q) ? cs->q->timeout : -1, + (int)cs->pfd.reqevents, + CS_ARG(cs), at, line); + + ap_assert(cs->q != NULL); + + if (cs->q) { + if (!locked) { + apr_thread_mutex_lock(timeout_mutex); + } + TO_QUEUE_REMOVE(cs->q, cs); + if (!locked) { + apr_thread_mutex_unlock(timeout_mutex); + } + } + + /* + * Some of the pollset backends, like KQueue or Epoll + * automagically remove the FD if the socket is closed, + * therefore, we can accept _SUCCESS or _NOTFOUND, + * and we still want to keep going + */ + rv = apr_pollset_remove(event_pollset, &cs->pfd); + if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) { + ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, cs->c, APLOGNO(03094) + "pollset remove failed for connection %" CS_FMT " at %s:%i", + CS_ARG(cs), at, line); + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + return 0; + } + + return 1; +} +#define pollset_del(cs, locked) \ + pollset_del_at((cs), (locked), __FUNCTION__, __LINE__) + /* Forward declare */ static void process_lingering_close(event_conn_state_t *cs); -static void update_reqevents_from_sense(event_conn_state_t *cs, - int default_sense) +static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd) +{ + event_conn_state_t *cs = apr_pcalloc(p, sizeof(*cs)); + listener_poll_type *pt; + + cs->p = p; + cs->pfd.desc.s = csd; + cs->pfd.desc_type = APR_POLL_SOCKET; + cs->pfd.client_data = pt = apr_pcalloc(p, sizeof(*pt)); + pt->type = PT_CSD; + pt->baton = cs; + + APR_RING_ELEM_INIT(cs, timeout_list); + + cs->sc = ap_get_module_config(ap_server_conf->module_config, + &mpm_event_module); + + /** + * XXX If the platform does not have a usable way of bundling + * accept() with a socket readability check, like Win32, + * and there are measurable delays before the + * socket is readable due to the first data packet arriving, + * it might be better to create the cs on the listener thread + * with the state set to CONN_STATE_KEEPALIVE + * + * FreeBSD users will want to enable the HTTP accept filter + * module in their kernel for the highest performance + * When the accept filter is active, sockets are kept in the + * kernel until a HTTP request is received. 
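+     * (On FreeBSD this is the accf_http(9) module; httpd can request it
+     * via the AcceptFilter directive.)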
+ */ + cs->pub.state = CONN_STATE_PROCESSING; + cs->pub.sense = CONN_SENSE_DEFAULT; + + apr_atomic_inc32(&connection_count); + apr_pool_cleanup_register(p, cs, decrement_connection_count, + apr_pool_cleanup_null); + return cs; +} + +static void set_conn_state_sense(event_conn_state_t *cs, int default_sense) { int sense = default_sense; @@ -1046,80 +1319,51 @@ static void update_reqevents_from_sense(event_conn_state_t *cs, /* * process one connection in the worker */ -static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * sock, - event_conn_state_t * cs, int my_child_num, - int my_thread_num) +static void process_socket(apr_thread_t *thd, apr_pool_t *p, + apr_socket_t *sock, event_conn_state_t *cs, + int my_child_num, int my_thread_num) { - conn_rec *c; + conn_rec *c = cs->c; long conn_id = ID_FROM_CHILD_THREAD(my_child_num, my_thread_num); - int clogging = 0, from_wc_q = 0; - apr_status_t rv; - int rc = OK; + int rc = OK, processed = 0, clogging; - if (cs == NULL) { /* This is a new connection */ - listener_poll_type *pt = apr_pcalloc(p, sizeof(*pt)); - cs = apr_pcalloc(p, sizeof(event_conn_state_t)); + if (!c) { /* This is a new connection */ cs->bucket_alloc = apr_bucket_alloc_create(p); ap_create_sb_handle(&cs->sbh, p, my_child_num, my_thread_num); - c = ap_run_create_connection(p, ap_server_conf, sock, - conn_id, cs->sbh, cs->bucket_alloc); + cs->c = c = ap_run_create_connection(p, ap_server_conf, sock, conn_id, + cs->sbh, cs->bucket_alloc); if (!c) { ap_queue_info_push_pool(worker_queue_info, p); return; } - apr_atomic_inc32(&connection_count); - apr_pool_cleanup_register(c->pool, cs, decrement_connection_count, - apr_pool_cleanup_null); + apr_pool_pre_cleanup_register(p, cs, ptrans_pre_cleanup); ap_set_module_config(c->conn_config, &mpm_event_module, cs); c->current_thread = thd; c->cs = &cs->pub; - cs->c = c; - cs->p = p; - cs->sc = ap_get_module_config(ap_server_conf->module_config, - &mpm_event_module); - cs->pfd.desc_type = APR_POLL_SOCKET; - cs->pfd.desc.s = sock; - pt->type = PT_CSD; - pt->baton = cs; - cs->pfd.client_data = pt; - apr_pool_pre_cleanup_register(p, cs, ptrans_pre_cleanup); - TO_QUEUE_ELEM_INIT(cs); ap_update_vhost_given_ip(c); - rc = ap_pre_connection(c, sock); if (rc != OK && rc != DONE) { ap_log_cerror(APLOG_MARK, APLOG_DEBUG, 0, c, APLOGNO(00469) - "process_socket: connection aborted"); + "process_socket: connection aborted (%d)", rc); close_connection(cs); return; } - /** - * XXX If the platform does not have a usable way of bundling - * accept() with a socket readability check, like Win32, - * and there are measurable delays before the - * socket is readable due to the first data packet arriving, - * it might be better to create the cs on the listener thread - * with the state set to CONN_STATE_KEEPALIVE - * - * FreeBSD users will want to enable the HTTP accept filter - * module in their kernel for the highest performance - * When the accept filter is active, sockets are kept in the - * kernel until a HTTP request is received. 
- */ - cs->pub.state = CONN_STATE_PROCESSING; cs->pub.sense = CONN_SENSE_DEFAULT; } - else { + else { /* The connection is scheduled back */ c = cs->c; + c->current_thread = thd; + c->id = conn_id; /* thread number is part of ID */ ap_update_sb_handle(cs->sbh, my_child_num, my_thread_num); notify_resume(cs, 0); - c->current_thread = thd; - /* Subsequent request on a conn, and thread number is part of ID */ - c->id = conn_id; } + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "processing connection %" CS_FMT " (aborted %d, clogging %d)", + CS_ARG(cs), c->aborted, c->clogging_input_filters); + if (CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { goto lingering_close; } @@ -1133,8 +1377,8 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc */ || c->clogging_input_filters) { process_connection: + processed = 1; cs->pub.state = CONN_STATE_PROCESSING; - clogging = c->clogging_input_filters; if (clogging) { apr_atomic_inc32(&clogged_count); @@ -1197,40 +1441,24 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc goto lingering_close; } } - else if (cs->pub.state == CONN_STATE_WRITE_COMPLETION) { - from_wc_q = 1; - } if (cs->pub.state == CONN_STATE_ASYNC_WAITIO) { /* Set a read/write timeout for this connection, and let the * event thread poll for read/writeability. */ - cs->queue_timestamp = apr_time_now(); - notify_suspend(cs); - ap_update_child_status(cs->sbh, SERVER_BUSY_READ, NULL); + notify_suspend(cs); /* Modules might set c->cs->sense to CONN_SENSE_WANT_WRITE, * the default is CONN_SENSE_WANT_READ still. */ - update_reqevents_from_sense(cs, CONN_SENSE_WANT_READ); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->io_q, cs); - rv = apr_pollset_add(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { - AP_DEBUG_ASSERT(0); - TO_QUEUE_REMOVE(cs->sc->io_q, cs); - apr_thread_mutex_unlock(timeout_mutex); - ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(10503) - "process_socket: apr_pollset_add failure in " - "CONN_STATE_ASYNC_WAITIO"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - } - else { - apr_thread_mutex_unlock(timeout_mutex); + if (pollset_add(cs, CONN_SENSE_WANT_READ, cs->sc->io_q)) { + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + cs->pub.state = CONN_STATE_LINGER; + goto lingering_close; } - return; + + return; /* queued */ } if (cs->pub.state == CONN_STATE_WRITE_COMPLETION) { @@ -1239,11 +1467,9 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc /* Flush all pending outputs before going to CONN_STATE_KEEPALIVE or * straight to CONN_STATE_PROCESSING if inputs are pending already. */ - ap_update_child_status(cs->sbh, SERVER_BUSY_WRITE, NULL); - if (from_wc_q) { - from_wc_q = 0; /* one shot */ + if (!processed) { pending = ap_check_output_pending(c); } else if (ap_filter_should_yield(c->output_filters)) { @@ -1251,38 +1477,24 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc } if (pending == AGAIN) { /* Let the event thread poll for write */ - cs->queue_timestamp = apr_time_now(); notify_suspend(cs); - - /* Add work to pollset. 
*/ cs->pub.sense = CONN_SENSE_DEFAULT; - update_reqevents_from_sense(cs, CONN_SENSE_WANT_WRITE); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->wc_q, cs); - rv = apr_pollset_add(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { - AP_DEBUG_ASSERT(0); - TO_QUEUE_REMOVE(cs->sc->wc_q, cs); - apr_thread_mutex_unlock(timeout_mutex); - ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03465) - "process_socket: apr_pollset_add failure in " - "CONN_STATE_WRITE_COMPLETION"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - } - else { - apr_thread_mutex_unlock(timeout_mutex); + if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q)) { + return; /* queued */ } - return; + /* Fall through lingering close */ + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); } - if (pending != OK || c->aborted || c->keepalive != AP_CONN_KEEPALIVE) { - cs->pub.state = CONN_STATE_LINGER; - goto lingering_close; - } - if (ap_check_input_pending(c) == AGAIN) { - goto process_connection; + else if (pending == OK) { + /* Some data to process immediately? */ + pending = (c->keepalive == AP_CONN_KEEPALIVE + ? ap_check_input_pending(c) + : DONE); + if (pending == AGAIN) { + goto process_connection; + } } - if (listener_may_exit) { + if (pending != OK || listener_may_exit) { cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } @@ -1302,40 +1514,25 @@ static void process_socket(apr_thread_t *thd, apr_pool_t * p, apr_socket_t * soc * timeout today. With a normal client, the socket will be readable in * a few milliseconds anyway. */ - cs->queue_timestamp = apr_time_now(); notify_suspend(cs); - /* Add work to pollset. */ - cs->pub.sense = CONN_SENSE_DEFAULT; - update_reqevents_from_sense(cs, CONN_SENSE_WANT_READ); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->ka_q, cs); - rv = apr_pollset_add(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { - AP_DEBUG_ASSERT(0); - TO_QUEUE_REMOVE(cs->sc->ka_q, cs); - apr_thread_mutex_unlock(timeout_mutex); - ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03093) - "process_socket: apr_pollset_add failure for " - "keep alive"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - } - else { - apr_thread_mutex_unlock(timeout_mutex); + if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->sc->ka_q)) { + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + cs->pub.state = CONN_STATE_LINGER; + goto lingering_close; } - return; + + return; /* queued */ } if (cs->pub.state == CONN_STATE_SUSPENDED) { cs->c->suspended_baton = cs; apr_atomic_inc32(&suspended_count); notify_suspend(cs); - return; + return; /* done */ } lingering_close: - /* CONN_STATE_LINGER[_*] fall through process_lingering_close() */ process_lingering_close(cs); } @@ -1347,31 +1544,29 @@ static apr_status_t event_resume_suspended (conn_rec *c) ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO(02615) "event_resume_suspended: suspended_baton is NULL"); return APR_EGENERAL; - } else if (!cs->suspended) { + } + if (!cs->suspended) { ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO(02616) "event_resume_suspended: Thread isn't suspended"); return APR_EGENERAL; } + apr_atomic_dec32(&suspended_count); c->suspended_baton = NULL; + cs->pub.sense = CONN_SENSE_DEFAULT; if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { - cs->queue_timestamp = apr_time_now(); - notify_suspend(cs); - - cs->pub.sense = CONN_SENSE_DEFAULT; cs->pub.state = CONN_STATE_WRITE_COMPLETION; - 
update_reqevents_from_sense(cs, CONN_SENSE_WANT_WRITE); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->wc_q, cs); - apr_pollset_add(event_pollset, &cs->pfd); - apr_thread_mutex_unlock(timeout_mutex); - } - else { - process_lingering_close(cs); - } + if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q)) { + return APR_SUCCESS; /* queued */ + } - return OK; + /* fall through lingering close on error */ + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + } + cs->pub.state = CONN_STATE_LINGER; + process_lingering_close(cs); + return APR_SUCCESS; } /* conns_this_child has gone to zero or below. See if the admin coded @@ -1388,36 +1583,31 @@ static void check_infinite_requests(void) conns_this_child = APR_INT32_MAX; } -static int close_listeners(int *closed) +static void set_child_dying(void) { - if (!*closed) { - int i; + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, "quiescing"); - ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - "closing listeners (connection_count=%u)", - apr_atomic_read32(&connection_count)); - ap_close_listeners_ex(my_bucket->listeners); + dying = 1; + ap_scoreboard_image->parent[ap_child_slot].quiescing = 1; + ap_close_listeners_ex(my_bucket->listeners); - dying = 1; - ap_scoreboard_image->parent[ap_child_slot].quiescing = 1; +#if 0 + { + int i; for (i = 0; i < threads_per_child; ++i) { ap_update_child_status_from_indexes(ap_child_slot, i, SERVER_GRACEFUL, NULL); } - /* wake up the main thread */ - kill(ap_my_pid, SIGTERM); - - ap_queue_info_free_idle_pools(worker_queue_info); - ap_queue_interrupt_all(worker_queue); - - *closed = 1; /* once */ - return 1; } +#endif - ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, - "closed listeners (connection_count=%u)", - apr_atomic_read32(&connection_count)); - return 0; + /* wake up idle worker threads */ + ap_queue_interrupt_all(worker_queue); + /* wake up the main thread */ + kill(ap_my_pid, SIGTERM); + + /* No new connections will use the idle pools */ + ap_queue_info_free_idle_pools(worker_queue_info); } static void unblock_signal(int sig) @@ -1500,9 +1690,10 @@ static apr_status_t push2worker(event_conn_state_t *cs, apr_socket_t *csd, apr_status_t rc; if (cs) { - csd = cs->pfd.desc.s; ptrans = cs->p; + csd = cs_sd(cs); } + rc = ap_queue_push_socket(worker_queue, csd, cs, ptrans); if (rc != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_CRIT, rc, ap_server_conf, APLOGNO(00471) @@ -1515,7 +1706,7 @@ static apr_status_t push2worker(event_conn_state_t *cs, apr_socket_t *csd, } else { if (csd) { - close_socket_nonblocking(csd); + close_socket(csd); } if (ptrans) { ap_queue_info_push_pool(worker_queue_info, ptrans); @@ -1572,43 +1763,30 @@ static timer_event_t timer_free_ring; static apr_skiplist *timer_skiplist; static volatile apr_time_t timers_next_expiry; -/* Same goal as for TIMEOUT_FUDGE_FACTOR (avoid extra poll calls), but applied - * to timers. Since their timeouts are custom (user defined), we can't be too - * approximative here (hence using 0.01s). - */ -#define EVENT_FUDGE_FACTOR apr_time_from_msec(10) - -/* The following compare function is used by apr_skiplist_insert() to keep the - * elements (timers) sorted and provide O(log n) complexity (this is also true - * for apr_skiplist_{find,remove}(), but those are not used in MPM event where - * inserted timers are not searched nor removed, but with apr_skiplist_pop() - * which does use any compare function). It is meant to return 0 when a == b, - * <0 when a < b, and >0 when a > b. 
However apr_skiplist_insert() will not - * add duplicates (i.e. a == b), and apr_skiplist_add() is only available in - * APR 1.6, yet multiple timers could possibly be created in the same micro- - * second (duplicates with regard to apr_time_t); therefore we implement the - * compare function to return +1 instead of 0 when compared timers are equal, - * thus duplicates are still added after each other (in order of insertion). +/* The timer_comp() function is used by apr_skiplist_insert() to keep the + * elements/timers sorted, but it should never return 0 because inserting + * duplicates is not possible (apr_skiplist_add() would allow this but it's + * not available before APR 1.6). Thus duplicates are sorted by order of + * insertion and timers are never equal for the skiplist (not an issue + * because MPM event does not use apr_skiplist_{find,remove}() but + * apr_skiplist_pop() only). */ static int timer_comp(void *a, void *b) { - apr_time_t t1 = (apr_time_t) ((timer_event_t *)a)->when; - apr_time_t t2 = (apr_time_t) ((timer_event_t *)b)->when; - AP_DEBUG_ASSERT(t1); - AP_DEBUG_ASSERT(t2); - return ((t1 < t2) ? -1 : 1); + const timer_event_t *ta = a, *tb = b; + return (ta->when < tb->when) ? -1 : 1; } static apr_thread_mutex_t *g_timer_skiplist_mtx; -static timer_event_t * event_get_timer_event(apr_time_t t, - ap_mpm_callback_fn_t *cbfn, - void *baton, - int insert, - apr_array_header_t *pfds) +static timer_event_t *get_timer_event(apr_time_t timeout, + ap_mpm_callback_fn_t *cbfn, + void *baton, + int insert, + apr_array_header_t *pfds) { timer_event_t *te; - apr_time_t now = (t < 0) ? 0 : apr_time_now(); + apr_time_t now = (timeout < 0) ? 0 : apr_time_now(); /* oh yeah, and make locking smarter/fine grained. */ @@ -1620,16 +1798,16 @@ static timer_event_t * event_get_timer_event(apr_time_t t, } else { te = apr_skiplist_alloc(timer_skiplist, sizeof(timer_event_t)); - APR_RING_ELEM_INIT(te, link); + memset(te, 0, sizeof(*te)); } + APR_RING_ELEM_INIT(te, link); te->cbfunc = cbfn; te->baton = baton; - te->canceled = 0; - te->when = now + t; + te->when = now + timeout; te->pfds = pfds; - if (insert) { + if (insert) { apr_time_t next_expiry; /* Okay, add sorted by when.. */ @@ -1639,33 +1817,51 @@ static timer_event_t * event_get_timer_event(apr_time_t t, * if it expires before. */ next_expiry = timers_next_expiry; - if (!next_expiry || next_expiry > te->when + EVENT_FUDGE_FACTOR) { + if (!next_expiry || next_expiry > te->when + TIMERS_FUDGE_TIMEOUT) { timers_next_expiry = te->when; - /* Unblock the poll()ing listener for it to update its timeout. */ + /* Wake up the listener to eventually update its poll()ing timeout. 
*/ if (listener_is_wakeable) { apr_pollset_wakeup(event_pollset); } } } + apr_thread_mutex_unlock(g_timer_skiplist_mtx); return te; } -static apr_status_t event_register_timed_callback_ex(apr_time_t t, +static void put_timer_event(timer_event_t *te, int locked) +{ + if (!locked) { + apr_thread_mutex_lock(g_timer_skiplist_mtx); + } + + memset(te, 0, sizeof(*te)); + APR_RING_INSERT_TAIL(&timer_free_ring.link, te, timer_event_t, link); + + if (!locked) { + apr_thread_mutex_unlock(g_timer_skiplist_mtx); + } +} + +static apr_status_t event_register_timed_callback_ex(apr_time_t timeout, ap_mpm_callback_fn_t *cbfn, - void *baton, + void *baton, apr_array_header_t *pfds) { - event_get_timer_event(t, cbfn, baton, 1, pfds); + if (!cbfn) { + return APR_EINVAL; + } + get_timer_event(timeout, cbfn, baton, 1, pfds); return APR_SUCCESS; } -static apr_status_t event_register_timed_callback(apr_time_t t, +static apr_status_t event_register_timed_callback(apr_time_t timeout, ap_mpm_callback_fn_t *cbfn, void *baton) { - event_register_timed_callback_ex(t, cbfn, baton, NULL); + event_register_timed_callback_ex(timeout, cbfn, baton, NULL); return APR_SUCCESS; } @@ -1687,6 +1883,10 @@ static apr_status_t event_cleanup_poll_callback(void *data) } } + if (final_rc) { + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + } return final_rc; } @@ -1697,18 +1897,24 @@ static apr_status_t event_register_poll_callback_ex(apr_pool_t *p, void *baton, apr_time_t timeout) { - socket_callback_baton_t *scb = apr_pcalloc(p, sizeof(*scb)); - listener_poll_type *pt = apr_palloc(p, sizeof(*pt)); + listener_poll_type *pt; + socket_callback_baton_t *scb; apr_status_t rc, final_rc = APR_SUCCESS; int i; - pt->type = PT_USER; - pt->baton = scb; + if (!cbfn || !tofn) { + return APR_EINVAL; + } + scb = apr_pcalloc(p, sizeof(*scb)); scb->cbfunc = cbfn; scb->user_baton = baton; scb->pfds = apr_array_copy(p, pfds); + pt = apr_palloc(p, sizeof(*pt)); + pt->type = PT_USER; + pt->baton = scb; + apr_pool_pre_cleanup_register(p, scb->pfds, event_cleanup_poll_callback); for (i = 0; i < scb->pfds->nelts; i++) { @@ -1725,9 +1931,12 @@ static apr_status_t event_register_poll_callback_ex(apr_pool_t *p, } } - if (timeout > 0) { - /* XXX: This cancel timer event can fire before the pollset is updated */ - scb->cancel_event = event_get_timer_event(timeout, tofn, baton, 1, scb->pfds); + if (timeout > 0) { + /* Prevent the timer from firing before the pollset is updated */ + if (timeout < TIMERS_FUDGE_TIMEOUT) { + timeout = TIMERS_FUDGE_TIMEOUT; + } + scb->cancel_event = get_timer_event(timeout, tofn, baton, 1, scb->pfds); } for (i = 0; i < scb->pfds->nelts; i++) { apr_pollfd_t *pfd = (apr_pollfd_t *)scb->pfds->elts + i; @@ -1766,14 +1975,13 @@ static apr_status_t event_register_poll_callback(apr_pool_t *p, #define LINGERING_BUF_SIZE (32 * 1024) static void process_lingering_close(event_conn_state_t *cs) { - apr_socket_t *csd = ap_get_conn_socket(cs->c); char dummybuf[LINGERING_BUF_SIZE]; - apr_size_t nbytes; + apr_socket_t *csd = cs_sd(cs); apr_status_t rv; - struct timeout_queue *q; ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "lingering close from state %i", (int)cs->pub.state); + "lingering close for connection %" CS_FMT, + CS_ARG(cs)); AP_DEBUG_ASSERT(CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)); if (!cs->linger_started) { @@ -1791,7 +1999,9 @@ static void process_lingering_close(event_conn_state_t *cs) close_connection(cs); return; } - + + notify_suspend(cs); + /* All nonblocking from now, no need for APR_INCOMPLETE_READ either */ 
apr_socket_timeout_set(csd, 0); apr_socket_opt_set(csd, APR_INCOMPLETE_READ, 0); @@ -1808,7 +2018,6 @@ static void process_lingering_close(event_conn_state_t *cs) cs->pub.state = CONN_STATE_LINGER_NORMAL; } cs->pub.sense = CONN_SENSE_DEFAULT; - notify_suspend(cs); /* One timestamp/duration for the whole lingering close time. * XXX: This makes the (short_)linger_q not sorted/ordered by expiring @@ -1821,32 +2030,18 @@ static void process_lingering_close(event_conn_state_t *cs) } do { - nbytes = sizeof(dummybuf); + apr_size_t nbytes = sizeof(dummybuf); rv = apr_socket_recv(csd, dummybuf, &nbytes); } while (rv == APR_SUCCESS); - - if (!APR_STATUS_IS_EAGAIN(rv)) { - close_connection(cs); - return; - } - - /* (Re)queue the connection to come back when readable */ - update_reqevents_from_sense(cs, CONN_SENSE_WANT_READ); - q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(q, cs); - rv = apr_pollset_add(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { - AP_DEBUG_ASSERT(0); - TO_QUEUE_REMOVE(q, cs); - apr_thread_mutex_unlock(timeout_mutex); - ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03092) - "process_lingering_close: apr_pollset_add failure"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - return; + if (APR_STATUS_IS_EAGAIN(rv)) { + struct timeout_queue *q; + /* (Re)queue the connection to come back when readable */ + q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; + if (pollset_add(cs, CONN_SENSE_WANT_READ, q)) { + return; /* queued */ + } } - apr_thread_mutex_unlock(timeout_mutex); + close_connection(cs); } /* call 'func' for all elements of 'q' above 'expiry'. @@ -1860,7 +2055,6 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, event_conn_state_t *first, *cs, *last; struct event_conn_state_t trash; struct timeout_queue *qp; - apr_status_t rv; if (!*q->total) { return; @@ -1891,19 +2085,29 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, apr_time_t elem_expiry = cs->queue_timestamp + qp->timeout; apr_time_t next_expiry = queues_next_expiry; if (!next_expiry - || next_expiry > elem_expiry + TIMEOUT_FUDGE_FACTOR) { + || next_expiry > elem_expiry + QUEUES_FUDGE_TIMEOUT) { queues_next_expiry = elem_expiry; } break; } - last = cs; - rv = apr_pollset_remove(event_pollset, &cs->pfd); - if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) { - AP_DEBUG_ASSERT(0); - ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, cs->c, APLOGNO(00473) - "apr_pollset_remove failed"); + TO_QUEUE_REMOVE(qp, cs); + if (!pollset_del(cs, 1)) { + shutdown_connection(cs); + continue; } + + if (cs == first) { + APR_RING_INSERT_HEAD(&qp->head, cs, event_conn_state_t, + timeout_list); + } + else { + APR_RING_INSERT_AFTER(last, cs, timeout_list); + } + ++*qp->total; + ++qp->count; + + last = cs; cs = APR_RING_NEXT(cs, timeout_list); count++; } @@ -1925,7 +2129,7 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, first = APR_RING_FIRST(&trash.timeout_list); do { cs = APR_RING_NEXT(first, timeout_list); - TO_QUEUE_ELEM_INIT(first); + APR_RING_ELEM_INIT(cs, timeout_list); func(first); first = cs; } while (--total); @@ -1950,8 +2154,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) apr_status_t rc; proc_info *ti = dummy; int process_slot = ti->pslot; - struct process_score *ps = ap_get_scoreboard_process(process_slot); - int closed = 0; + 
process_score *ps = ap_get_scoreboard_process(process_slot);
     int have_idle_worker = 0;
     apr_time_t last_log;
 
@@ -1969,31 +2172,37 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
     unblock_signal(LISTENER_SIGNAL);
 
     for (;;) {
-        timer_event_t *te;
-        const apr_pollfd_t *out_pfd;
         apr_int32_t num = 0;
-        apr_interval_time_t timeout;
-        socket_callback_baton_t *user_chain;
-        apr_time_t now, expiry = -1;
+        apr_time_t next_expiry = -1;
+        apr_interval_time_t timeout = -1;
         int workers_were_busy = 0;
+        socket_callback_baton_t *user_chain;
+        const apr_pollfd_t *out_pfd;
+        apr_time_t now;
+        event_conn_state_t *cs;
+        timer_event_t *te;
 
-        if (conns_this_child <= 0)
+        if (conns_this_child <= 0) {
+            /* Gracefully stop (eventually) and keep going */
            check_infinite_requests();
+        }
 
        if (listener_may_exit) {
-            int first_close = close_listeners(&closed);
+            int once = !dying;
+            if (once) {
+                set_child_dying();
+            }
 
            if (terminate_mode == ST_UNGRACEFUL
                || apr_atomic_read32(&connection_count) == 0)
                break;
 
-            /* Don't wait in poll() for the first close (i.e. dying now), we
-             * want to maintain the queues and schedule defer_linger_chain ASAP
-             * to kill kept-alive connection and shutdown the workers and child
-             * faster.
-             */
-            if (first_close) {
-                goto do_maintenance; /* with expiry == -1 */
+            if (once) {
+                /* Don't wait in poll() the first time (i.e. dying now), we
+                 * want to maintain the queues ASAP to shut down the workers
+                 * and exit the child faster.
+                 */
+                goto do_maintenance; /* with next_expiry == -1 */
            }
        }
 
@@ -2002,8 +2211,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
            /* trace log status every second */
            if (now - last_log > apr_time_from_sec(1)) {
                ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf,
-                             "connections: %u (waitio:%u write-completion:%u"
-                             "keep-alive:%u lingering:%u suspended:%u clogged:%u), "
+                             "connections: %u (waitio:%u write:%u keepalive:%u "
+                             "lingering:%u suspended:%u clogged:%u), "
                             "workers: %u/%u shutdown",
                             apr_atomic_read32(&connection_count),
                             apr_atomic_read32(waitio_q->total),
@@ -2034,11 +2243,11 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
        now = apr_time_now();
        timeout = -1;
 
-        /* Push expired timers to a worker, the first remaining one determines
-         * the maximum time to poll() below, if any.
+        /* Push expired timers to a worker, the first remaining one (if any)
+         * determines the maximum time to poll() below.
*/ - expiry = timers_next_expiry; - if (expiry && expiry < now) { + next_expiry = timers_next_expiry; + if (next_expiry && next_expiry <= now) { apr_thread_mutex_lock(g_timer_skiplist_mtx); while ((te = apr_skiplist_peek(timer_skiplist))) { if (te->when > now) { @@ -2047,56 +2256,67 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) break; } apr_skiplist_pop(timer_skiplist, NULL); - if (!te->canceled) { - if (te->pfds) { - /* remove all sockets from the pollset */ - apr_pool_cleanup_run(te->pfds->pool, te->pfds, - event_cleanup_poll_callback); - } - push_timer2worker(te); + + if (te->canceled) { + put_timer_event(te, 1); + continue; } - else { - APR_RING_INSERT_TAIL(&timer_free_ring.link, te, - timer_event_t, link); + + if (te->pfds) { + /* remove all sockets from the pollset */ + apr_pool_cleanup_run(te->pfds->pool, te->pfds, + event_cleanup_poll_callback); } + push_timer2worker(te); + } + if (te) { + next_expiry = te->when; } - if (!te) { - timers_next_expiry = 0; + else { + next_expiry = 0; } + timers_next_expiry = next_expiry; apr_thread_mutex_unlock(g_timer_skiplist_mtx); } + if (next_expiry) { + timeout = next_expiry > now ? next_expiry - now : 0; + } /* Same for queues, use their next expiry, if any. */ - expiry = queues_next_expiry; - if (expiry - && (timeout < 0 - || expiry <= now - || timeout > expiry - now)) { - timeout = expiry > now ? expiry - now : 0; + next_expiry = queues_next_expiry; + if (next_expiry && (timeout < 0 || next_expiry - now < timeout)) { + timeout = next_expiry > now ? next_expiry - now : 0; } /* When non-wakeable, don't wait more than 100 ms, in any case. */ -#define NON_WAKEABLE_POLL_TIMEOUT apr_time_from_msec(100) - if (!listener_is_wakeable - && (timeout < 0 - || timeout > NON_WAKEABLE_POLL_TIMEOUT)) { - timeout = NON_WAKEABLE_POLL_TIMEOUT; + if (!listener_is_wakeable && (timeout < 0 || timeout > NON_WAKEABLE_TIMEOUT)) { + timeout = NON_WAKEABLE_TIMEOUT; } else if (timeout > 0) { - /* apr_pollset_poll() might round down the timeout to milliseconds, - * let's forcibly round up here to never return before the timeout. + /* apr_pollset_poll() might round down the timeout to + * milliseconds, let's forcibly round up here to never + * return before the timeout. */ timeout = apr_time_from_msec( apr_time_as_msec(timeout + apr_time_from_msec(1) - 1) ); } + /* Unpause listening sockets before poll()ing if possible */ + if (should_enable_listensocks()) { + enable_listensocks(); + } + ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, - "polling with timeout=%" APR_TIME_T_FMT + "pollset: wait for timeout=%" APR_TIME_T_FMT " queues_timeout=%" APR_TIME_T_FMT - " timers_timeout=%" APR_TIME_T_FMT, - timeout, queues_next_expiry - now, - timers_next_expiry - now); + " timers_timeout=%" APR_TIME_T_FMT + " conns=%d exit=%d/%d", + timeout, + queues_next_expiry ? queues_next_expiry - now : -1, + timers_next_expiry ? timers_next_expiry - now : -1, + apr_atomic_read32(&connection_count), + listener_may_exit, dying); rc = apr_pollset_poll(event_pollset, timeout, &num, &out_pfd); if (rc != APR_SUCCESS) { @@ -2105,59 +2325,55 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) APLOGNO(03267) "apr_pollset_poll failed. 
Attempting to " "shutdown process gracefully"); + AP_DEBUG_ASSERT(0); signal_threads(ST_GRACEFUL); } num = 0; } if (APLOGtrace7(ap_server_conf)) { + apr_time_t old_now = now; now = apr_time_now(); + ap_log_error(APLOG_MARK, APLOG_TRACE7, rc, ap_server_conf, - "polled with num=%u exit=%d/%d conns=%d" + "pollset: have #%i time=%" APR_TIME_T_FMT "/%" APR_TIME_T_FMT " queues_timeout=%" APR_TIME_T_FMT - " timers_timeout=%" APR_TIME_T_FMT, - num, listener_may_exit, dying, + " timers_timeout=%" APR_TIME_T_FMT + " conns=%d exit=%d/%d", + (int)num, now - old_now, timeout, + queues_next_expiry ? queues_next_expiry - now : -1, + timers_next_expiry ? timers_next_expiry - now : -1, apr_atomic_read32(&connection_count), - queues_next_expiry - now, timers_next_expiry - now); + listener_may_exit, dying); } /* XXX possible optimization: stash the current time for use as * r->request_time for new requests or queues maintenance */ - for (user_chain = NULL; num; --num, ++out_pfd) { - listener_poll_type *pt = (listener_poll_type *) out_pfd->client_data; + for (user_chain = NULL; num > 0; --num, ++out_pfd) { + listener_poll_type *pt = out_pfd->client_data; + if (pt->type == PT_CSD) { /* one of the sockets is readable */ - event_conn_state_t *cs = (event_conn_state_t *) pt->baton; - struct timeout_queue *remove_from_q = NULL; - /* don't wait for a worker for a keepalive request or - * lingering close processing. */ - int blocking = 0; - - switch (cs->pub.state) { - case CONN_STATE_WRITE_COMPLETION: - remove_from_q = cs->sc->wc_q; - blocking = 1; - break; + int blocking = 1; - case CONN_STATE_ASYNC_WAITIO: - cs->pub.state = CONN_STATE_PROCESSING; - remove_from_q = cs->sc->io_q; - blocking = 1; - break; + cs = (event_conn_state_t *) pt->baton; + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "polled connection %" CS_FMT, + CS_ARG(cs)); + switch (cs->pub.state) { case CONN_STATE_KEEPALIVE: + case CONN_STATE_ASYNC_WAITIO: cs->pub.state = CONN_STATE_PROCESSING; - remove_from_q = cs->sc->ka_q; + case CONN_STATE_WRITE_COMPLETION: break; case CONN_STATE_LINGER_NORMAL: - remove_from_q = linger_q; - break; - case CONN_STATE_LINGER_SHORT: - remove_from_q = short_linger_q; + /* don't wait for a worker for lingering close processing. 
*/ + blocking = 0; break; default: @@ -2168,26 +2384,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ap_assert(0); } - if (remove_from_q) { - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_REMOVE(remove_from_q, cs); - rc = apr_pollset_remove(event_pollset, &cs->pfd); - apr_thread_mutex_unlock(timeout_mutex); - /* - * Some of the pollset backends, like KQueue or Epoll - * automagically remove the FD if the socket is closed, - * therefore, we can accept _SUCCESS or _NOTFOUND, - * and we still want to keep going - */ - if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) { - AP_DEBUG_ASSERT(0); - ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf, - APLOGNO(03094) "pollset remove failed"); - close_connection(cs); - signal_threads(ST_GRACEFUL); - break; - } + if (!pollset_del(cs, 0)) { + shutdown_connection(cs); + continue; + } + { /* If we don't get a worker immediately (nonblocking), we * close the connection; the client can re-connect to a * different process for keepalive, and for lingering close @@ -2269,14 +2471,21 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) resource_shortage = 1; signal_threads(ST_GRACEFUL); } - else if (ap_accept_error_is_nonfatal(rc)) { - ap_log_error(APLOG_MARK, APLOG_DEBUG, rc, ap_server_conf, + else if (ap_accept_error_is_nonfatal(rc)) { + ap_log_error(APLOG_MARK, APLOG_DEBUG, rc, ap_server_conf, "accept() on client socket failed"); } if (csd != NULL) { conns_this_child--; - if (push2worker(NULL, csd, ptrans) == APR_SUCCESS) { + + /* Create and account for the connection from here, or + * a graceful shutdown happening before it's processed + * would consider it does not exist and could exit the + * child too early. + */ + cs = make_conn_state(ptrans, csd); + if (push2worker(cs, NULL, NULL) == APR_SUCCESS) { have_idle_worker = 0; } } @@ -2304,7 +2513,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * with the user callback being called while we handle * the same baton multiple times here. */ - if (!baton->signaled) { + if (!baton->signaled) { baton->signaled = 1; baton->next = user_chain; user_chain = baton; @@ -2312,7 +2521,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } } /* for processing poll */ - /* Time to handle user callbacks chained above */ + /* Time to queue user callbacks chained above */ while (user_chain) { socket_callback_baton_t *baton = user_chain; user_chain = user_chain->next; @@ -2323,30 +2532,31 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) event_cleanup_poll_callback); /* masquerade as a timer event that is firing */ - te = event_get_timer_event(-1 /* fake timer */, - baton->cbfunc, - baton->user_baton, - 0, /* don't insert it */ - NULL /* no associated socket callback */); + te = get_timer_event(-1 /* fake timer */, + baton->cbfunc, + baton->user_baton, + 0, /* don't insert it */ + NULL /* no associated socket callback */); push_timer2worker(te); } /* We process the timeout queues here only when the global - * queues_next_expiry is passed. This happens accurately since + * queues_next_expiry has passed. This happens accurately since * adding to the queues (in workers) can only decrease this expiry, * while latest ones are only taken into account here (in listener) * during queues' processing, with the lock held. This works both * with and without wake-ability. 
*/ - expiry = queues_next_expiry; + next_expiry = queues_next_expiry; do_maintenance: - if (expiry && expiry < (now = apr_time_now())) { + if (next_expiry && next_expiry <= (now = apr_time_now())) { ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, - "queues maintenance with timeout=%" APR_TIME_T_FMT, - expiry > 0 ? expiry - now : -1); + "queues maintenance: expired=%" APR_TIME_T_FMT, + next_expiry > 0 ? now - next_expiry : -1); + apr_thread_mutex_lock(timeout_mutex); - /* Steps below will recompute this. */ + /* Recompute this by walking the timeout queues (under the lock) */ queues_next_expiry = 0; /* Step 1: keepalive queue timeouts are closed */ @@ -2373,11 +2583,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* Step 5: short lingering close queue timeouts are closed */ process_timeout_queue(short_linger_q, now, shutdown_connection); + next_expiry = queues_next_expiry; apr_thread_mutex_unlock(timeout_mutex); + ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, - "queues maintained with timeout=%" APR_TIME_T_FMT, - queues_next_expiry > now ? queues_next_expiry - now - : -1); + "queues maintained: next timeout=%" APR_TIME_T_FMT, + next_expiry ? next_expiry - now : -1); ps->wait_io = apr_atomic_read32(waitio_q->total); ps->write_completion = apr_atomic_read32(write_completion_q->total); @@ -2411,12 +2622,11 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) have_idle_worker = 0; } } - - if (!workers_were_busy && should_enable_listensocks()) { - enable_listensocks(); - } } /* listener main loop */ + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "listener thread exiting"); + ap_queue_term(worker_queue); apr_thread_exit(thd, APR_SUCCESS); @@ -2429,23 +2639,25 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * * return 1 if thread should exit, 0 if it should continue running. */ -static int worker_thread_should_exit_early(void) +static int worker_thread_should_exit_early(int slot) { + const apr_uint32_t max = threads_per_child; for (;;) { apr_uint32_t conns = apr_atomic_read32(&connection_count); - apr_uint32_t dead = apr_atomic_read32(&threads_shutdown); - apr_uint32_t newdead; + apr_uint32_t deads = apr_atomic_read32(&threads_shutdown); - AP_DEBUG_ASSERT(dead <= threads_per_child); - if (conns >= threads_per_child - dead) + AP_DEBUG_ASSERT(deads < max); + if (conns >= max - deads) return 0; - newdead = dead + 1; - if (apr_atomic_cas32(&threads_shutdown, newdead, dead) == dead) { + if (apr_atomic_cas32(&threads_shutdown, deads + 1, deads) == deads) { /* * No other thread has exited in the mean time, safe to exit * this one. 
*/ + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "worker thread %i/%i-%i should exit (%i conns)", + slot, threads_per_child, deads + 1, conns); return 1; } } @@ -2463,20 +2675,21 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) proc_info *ti = dummy; int process_slot = ti->pslot; int thread_slot = ti->tslot; + worker_score *ws = &ap_scoreboard_image->servers[process_slot][thread_slot]; apr_status_t rv; int is_idle = 0; free(ti); - ap_scoreboard_image->servers[process_slot][thread_slot].pid = ap_my_pid; - ap_scoreboard_image->servers[process_slot][thread_slot].tid = apr_os_thread_current(); - ap_scoreboard_image->servers[process_slot][thread_slot].generation = retained->mpm->my_generation; + ws->pid = ap_my_pid; + ws->tid = apr_os_thread_current(); + ws->generation = retained->mpm->my_generation; ap_update_child_status_from_indexes(process_slot, thread_slot, SERVER_STARTING, NULL); for (;;) { apr_socket_t *csd = NULL; - event_conn_state_t *cs; + event_conn_state_t *cs = NULL; timer_event_t *te = NULL; apr_pool_t *ptrans; /* Pool for per-transaction stuff */ @@ -2490,23 +2703,33 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) signal_threads(ST_GRACEFUL); break; } - /* A new idler may have changed connections_above_limit(), - * let the listener know and decide. + ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, + "worker thread %i/%i idle (idlers %i)", + thread_slot, threads_per_child, + ap_queue_info_num_idlers(worker_queue_info)); + is_idle = 1; + + /* If the listening sockets are paused and this new idler switches + * connections_above_limit() back, let the listener know and poll + * them again. */ if (listener_is_wakeable && should_enable_listensocks()) { apr_pollset_wakeup(event_pollset); } - is_idle = 1; } ap_update_child_status_from_indexes(process_slot, thread_slot, dying ? SERVER_GRACEFUL - : SERVER_READY, NULL); - worker_pop: + : SERVER_READY, + NULL); + if (workers_may_exit) { + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "worker thread %i/%i may exit", + thread_slot, threads_per_child); break; } - if (dying && worker_thread_should_exit_early()) { + if (dying && worker_thread_should_exit_early(thread_slot)) { break; } @@ -2518,8 +2741,12 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) * connections accepted by this server process have been handled. */ if (APR_STATUS_IS_EOF(rv)) { + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "worker thread %i/%i queue terminated", + thread_slot, threads_per_child); break; } + /* We get APR_EINTR whenever ap_queue_pop_*() has been interrupted * from an explicit call to ap_queue_interrupt_all(). This allows * us to unblock threads stuck in ap_queue_pop_*() when a shutdown @@ -2531,26 +2758,29 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) * may have already been cleaned up. Don't log the "error" if * workers_may_exit is set. */ - else if (APR_STATUS_IS_EINTR(rv)) { - goto worker_pop; - } - /* We got some other error. 
*/ - else if (!workers_may_exit) { + if (!APR_STATUS_IS_EINTR(rv) && !workers_may_exit) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, ap_server_conf, - APLOGNO(03099) "ap_queue_pop_socket failed"); + APLOGNO(03099) "ap_queue_pop_something failed"); + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); } continue; } + + ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, + "worker thread %i/%i busy (idlers %i)", + thread_slot, threads_per_child, + ap_queue_info_num_idlers(worker_queue_info)); + if (te != NULL) { - te->cbfunc(te->baton); - { - apr_thread_mutex_lock(g_timer_skiplist_mtx); - APR_RING_INSERT_TAIL(&timer_free_ring.link, te, timer_event_t, link); - apr_thread_mutex_unlock(g_timer_skiplist_mtx); - } + void *baton = te->baton; + ap_mpm_callback_fn_t *cbfunc = te->cbfunc; + /* first recycle the timer event */ + put_timer_event(te, 0); + cbfunc(baton); } else { - is_idle = 0; + is_idle = 0; /* consumed */ if (csd != NULL) { worker_sockets[thread_slot] = csd; process_socket(thd, ptrans, csd, cs, process_slot, thread_slot); @@ -2572,15 +2802,23 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) cs->chain = NULL; AP_DEBUG_ASSERT(cs->pub.state == CONN_STATE_LINGER); - worker_sockets[thread_slot] = csd = cs->pfd.desc.s; + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "deferred close for connection %" CS_FMT, CS_ARG(cs)); + + worker_sockets[thread_slot] = csd = cs_sd(cs); process_socket(thd, cs->p, csd, cs, process_slot, thread_slot); worker_sockets[thread_slot] = NULL; } } + if (is_idle) { + /* Not idling anymore */ + ap_queue_info_wait_for_idler(worker_queue_info, NULL); + } ap_update_child_status_from_indexes(process_slot, thread_slot, dying ? SERVER_DEAD - : SERVER_GRACEFUL, NULL); + : SERVER_GRACEFUL, + NULL); apr_thread_exit(thd, APR_SUCCESS); return NULL; @@ -2623,14 +2861,14 @@ static void setup_threads_runtime(void) ap_listen_rec *lr; apr_pool_t *pskip = NULL; int max_recycled_pools = -1, i; - const int good_methods[] = { APR_POLLSET_KQUEUE, - APR_POLLSET_PORT, + const int good_methods[] = { APR_POLLSET_PORT, + APR_POLLSET_KQUEUE, APR_POLLSET_EPOLL }; /* XXX: K-A or lingering close connection included in the async factor */ - const apr_uint32_t async_factor = worker_factor / WORKER_FACTOR_SCALE; - const apr_uint32_t pollset_size = (apr_uint32_t)num_listensocks + - (apr_uint32_t)threads_per_child * - (async_factor > 2 ? async_factor : 2); + const unsigned int threads_factor = worker_factor / WORKER_FACTOR_SCALE; + const apr_size_t pollset_size = ((unsigned int)num_listensocks + + (unsigned int)threads_per_child * + (threads_factor > 2 ? 
threads_factor : 2)); int pollset_flags; /* Event's skiplist operations will happen concurrently with other modules' @@ -2730,13 +2968,13 @@ static void setup_threads_runtime(void) } /* Add listeners to the main pollset */ - listener_pollfd = apr_pcalloc(pruntime, num_listensocks * - sizeof(apr_pollfd_t)); + listener_pollfd = apr_pcalloc(pruntime, + num_listensocks * sizeof(apr_pollfd_t)); for (i = 0, lr = my_bucket->listeners; lr; lr = lr->next, i++) { apr_pollfd_t *pfd; listener_poll_type *pt; - AP_DEBUG_ASSERT(i < num_listensocks); + ap_assert(i < num_listensocks); pfd = &listener_pollfd[i]; pfd->reqevents = APR_POLLIN | APR_POLLHUP | APR_POLLERR; @@ -2758,7 +2996,12 @@ static void setup_threads_runtime(void) pt->baton = lr; apr_socket_opt_set(pfd->desc.s, APR_SO_NONBLOCK, 1); - apr_pollset_add(event_pollset, pfd); + rv = apr_pollset_add(event_pollset, pfd); + if (rv != APR_SUCCESS) { + ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(10473) + "apr_pollset_add for listener failed."); + clean_child_exit(APEXIT_CHILDFATAL); + } lr->accept_func = ap_unixd_accept; } @@ -2906,7 +3149,7 @@ static void join_workers(apr_thread_t * listener, apr_thread_t ** threads) /* listener has not stopped accepting yet */ ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, "listener has not stopped accepting yet (%d iter)", iter); - wakeup_listener(); + shutdown_listener(); } if (iter > 10) { ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(00475) @@ -2922,6 +3165,9 @@ static void join_workers(apr_thread_t * listener, apr_thread_t ** threads) } for (i = 0; i < threads_per_child; i++) { + ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, + "apr_thread_join: joining thread %pp (%i/%i)", + threads[i], i, threads_per_child); if (threads[i]) { /* if we ever created this thread */ rv = apr_thread_join(&thread_rv, threads[i]); if (rv != APR_SUCCESS) { @@ -3043,7 +3289,7 @@ static void child_main(int child_num_arg, int child_bucket) if (rv != APR_SUCCESS && rv != APR_ENOTIMPL) { ap_log_error(APLOG_MARK, APLOG_WARNING, rv, ap_server_conf, APLOGNO(02436) "WARNING: ThreadStackSize of %" APR_SIZE_T_FMT " is " - "inappropriate, using default", + "inappropriate, using default", ap_thread_stacksize); } } @@ -3384,7 +3630,7 @@ static void perform_idle_server_maintenance(void) retained->maxclients_reported = 1; } } - else { + else { if (!retained->near_maxclients_reported) { ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(10159) "server is within MinSpareThreads of " @@ -3490,7 +3736,7 @@ static void server_main_loop(int remaining_children_to_start) child_slot = ap_find_child_by_pid(&pid); if (processed_status == APEXIT_CHILDFATAL) { /* fix race condition found in PR 39311 - * A child created at the same time as a graceful happens + * A child created at the same time as a graceful happens * can find the lock missing and create a fatal error. * It is not fatal for the last generation to be in this state. 
*/ @@ -3866,25 +4112,23 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) return OK; } -static void setup_slave_conn(conn_rec *c, void *csd) +static void setup_slave_conn(conn_rec *c, void *csd) { event_conn_state_t *mcs; event_conn_state_t *cs; - + mcs = ap_get_module_config(c->master->conn_config, &mpm_event_module); - - cs = apr_pcalloc(c->pool, sizeof(*cs)); + + cs = make_conn_state(c->pool, csd); cs->c = c; - cs->r = NULL; cs->sc = mcs->sc; cs->suspended = 0; - cs->p = c->pool; cs->bucket_alloc = c->bucket_alloc; cs->pfd = mcs->pfd; cs->pub = mcs->pub; cs->pub.state = CONN_STATE_PROCESSING; cs->pub.sense = CONN_SENSE_DEFAULT; - + c->cs = &(cs->pub); ap_set_module_config(c->conn_config, &mpm_event_module, cs); } @@ -3908,7 +4152,7 @@ static int event_protocol_switch(conn_rec *c, request_rec *r, server_rec *s, * other than http/1.1, this might never happen. */ event_conn_state_t *cs; - + cs = ap_get_module_config(c->conn_config, &mpm_event_module); cs->sc = ap_get_module_config(s->module_config, &mpm_event_module); } @@ -3932,7 +4176,11 @@ static int event_open_logs(apr_pool_t * p, apr_pool_t * plog, level_flags |= APLOG_STARTUP; } - if ((num_listensocks = ap_setup_listeners(ap_server_conf)) < 1) { + /* This sets up new listeners or reuses existing ones, as well as cleaning + * up unused ones from the previous generation. + */ + num_listensocks = ap_setup_listeners(ap_server_conf); + if (num_listensocks < 1) { ap_log_error(APLOG_MARK, APLOG_ALERT | level_flags, 0, (startup ? NULL : s), APLOGNO(03272) "no listening sockets available, shutting down"); @@ -4045,74 +4293,34 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, apr_pool_t *ptemp, server_rec *s) { - struct { - struct timeout_queue *tail, *q; - apr_hash_t *hash; - } io, wc, ka; + apr_hash_t *io_h, *wc_h, *ka_h; /* Not needed in pre_config stage */ if (ap_state_query(AP_SQ_MAIN_STATE) == AP_SQ_MS_CREATE_PRE_CONFIG) { return OK; } - io.hash = apr_hash_make(ptemp); - wc.hash = apr_hash_make(ptemp); - ka.hash = apr_hash_make(ptemp); - io.tail = wc.tail = ka.tail = NULL; + io_h = apr_hash_make(ptemp); + wc_h = apr_hash_make(ptemp); + ka_h = apr_hash_make(ptemp); - linger_q = TO_QUEUE_MAKE(pconf, apr_time_from_sec(MAX_SECS_TO_LINGER), - NULL); - short_linger_q = TO_QUEUE_MAKE(pconf, apr_time_from_sec(SECONDS_TO_LINGER), - NULL); + linger_q = TO_QUEUE_MAKE(pconf, "linger", + apr_time_from_sec(MAX_SECS_TO_LINGER), NULL); + short_linger_q = TO_QUEUE_MAKE(pconf, "short_linger", + apr_time_from_sec(SECONDS_TO_LINGER), NULL); for (; s; s = s->next) { event_srv_cfg *sc = apr_pcalloc(pconf, sizeof *sc); - ap_set_module_config(s->module_config, &mpm_event_module, sc); - if (!io.tail) { - /* The main server uses the global queues */ - io.q = TO_QUEUE_MAKE(pconf, s->timeout, NULL); - apr_hash_set(io.hash, &s->timeout, sizeof s->timeout, io.q); - io.tail = waitio_q = io.q; - - wc.q = TO_QUEUE_MAKE(pconf, s->timeout, NULL); - apr_hash_set(wc.hash, &s->timeout, sizeof s->timeout, wc.q); - wc.tail = write_completion_q = wc.q; - - ka.q = TO_QUEUE_MAKE(pconf, s->keep_alive_timeout, NULL); - apr_hash_set(ka.hash, &s->keep_alive_timeout, - sizeof s->keep_alive_timeout, ka.q); - ka.tail = keepalive_q = ka.q; - } - else { - /* The vhosts use any existing queue with the same timeout, - * or their own queue(s) if there isn't */ - io.q = apr_hash_get(io.hash, &s->timeout, sizeof s->timeout); - if (!io.q) { - io.q = TO_QUEUE_MAKE(pconf, 
s->timeout, io.tail);
-                apr_hash_set(io.hash, &s->timeout, sizeof s->timeout, io.q);
-                io.tail = io.tail->next = io.q;
-            }
-            wc.q = apr_hash_get(wc.hash, &s->timeout, sizeof s->timeout);
-            if (!wc.q) {
-                wc.q = TO_QUEUE_MAKE(pconf, s->timeout, wc.tail);
-                apr_hash_set(wc.hash, &s->timeout, sizeof s->timeout, wc.q);
-                wc.tail = wc.tail->next = wc.q;
-            }
+        sc->io_q = TO_QUEUE_CHAIN(pconf, "waitio", s->timeout,
+                                  &waitio_q, io_h, ptemp);
 
-            ka.q = apr_hash_get(ka.hash, &s->keep_alive_timeout,
-                                sizeof s->keep_alive_timeout);
-            if (!ka.q) {
-                ka.q = TO_QUEUE_MAKE(pconf, s->keep_alive_timeout, ka.tail);
-                apr_hash_set(ka.hash, &s->keep_alive_timeout,
-                             sizeof s->keep_alive_timeout, ka.q);
-                ka.tail = ka.tail->next = ka.q;
-            }
-        }
-        sc->io_q = io.q;
-        sc->wc_q = wc.q;
-        sc->ka_q = ka.q;
+        sc->wc_q = TO_QUEUE_CHAIN(pconf, "write_completion", s->timeout,
+                                  &write_completion_q, wc_h, ptemp);
+
+        sc->ka_q = TO_QUEUE_CHAIN(pconf, "keepalive", s->keep_alive_timeout,
+                                  &keepalive_q, ka_h, ptemp);
     }
 
     return OK;
@@ -4430,7 +4638,7 @@ static const char *set_threads_per_child(cmd_parms * cmd, void *dummy,
     threads_per_child = atoi(arg);
     return NULL;
 }
-static const char *set_server_limit (cmd_parms *cmd, void *dummy, const char *arg)
+static const char *set_server_limit(cmd_parms *cmd, void *dummy, const char *arg)
 {
     const char *err = ap_check_cmd_context(cmd, GLOBAL_ONLY);
     if (err != NULL) {
diff --git a/server/mpm/motorz/motorz.c b/server/mpm/motorz/motorz.c
index 7026d08cd6e..e06aeab573b 100644
--- a/server/mpm/motorz/motorz.c
+++ b/server/mpm/motorz/motorz.c
@@ -380,8 +380,8 @@ static apr_status_t motorz_io_process(motorz_conn_t *scon)
         scon->cs.state = CONN_STATE_PROCESSING;
     }
 
-read_request:
     if (scon->cs.state == CONN_STATE_PROCESSING) {
+ process_connection:
         ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(03328)
                      "motorz_io_process(): CONN_STATE_PROCESSING");
         if (!c->aborted) {
@@ -432,14 +432,14 @@ static apr_status_t motorz_io_process(motorz_conn_t *scon)
             }
             return APR_SUCCESS;
         }
-        if (c->keepalive != AP_CONN_KEEPALIVE) {
-            pending = DONE;
-        }
-        else if (pending == OK) {
-            pending = ap_check_input_pending(c);
+        if (pending == OK) {
+            /* Some data to process immediately? */
+            pending = (c->keepalive == AP_CONN_KEEPALIVE
+                       ? ap_check_input_pending(c)
+                       : DONE);
             if (pending == AGAIN) {
                 scon->cs.state = CONN_STATE_PROCESSING;
-                goto read_request;
+                goto process_connection;
             }
         }
         if (pending == OK) {
diff --git a/server/mpm/simple/simple_io.c b/server/mpm/simple/simple_io.c
index 36c5ad87956..154c9a2c1d3 100644
--- a/server/mpm/simple/simple_io.c
+++ b/server/mpm/simple/simple_io.c
@@ -126,11 +126,11 @@ static apr_status_t simple_io_process(simple_conn_t * scon)
             }
             return APR_SUCCESS;
         }
-        if (c->keepalive != AP_CONN_KEEPALIVE) {
-            pending = DONE;
-        }
-        else if (pending == OK) {
-            pending = ap_check_input_pending(c);
+        if (pending == OK) {
+            /* Some data to process immediately? */
+            pending = (c->keepalive == AP_CONN_KEEPALIVE
+                       ? ap_check_input_pending(c)
+                       : DONE);
             if (pending == AGAIN) {
                 scon->cs.state = CONN_STATE_PROCESSING;
                 continue;

From db8ec1e53750901d0acf44a59e6346f9bd9c7b90 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 1 Feb 2022 22:47:38 +0100
Subject: [PATCH 04/22] mpm_event: Use monotonic timestamps if available.

If clock_gettime() and CLOCK_MONOTONIC are defined (i.e. most if not all
unixes), use them to provide a timestamp that never goes backwards (even
if the admin changes the system time).

This avoids queue entries suddenly appearing to expire centuries in the
future after a clock skew.
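For illustration, a minimal standalone sketch (not part of the patch) of the
approach: it assumes a POSIX system where clock_gettime() and CLOCK_MONOTONIC
are available, and mirrors the timespec-to-microseconds conversion used by
event_time_now() in the diff below; the usec_t typedef is a stand-in for
apr_time_t.

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    typedef int64_t usec_t; /* microseconds, stand-in for apr_time_t */

    static usec_t monotonic_now(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts); /* unaffected by clock skew */
        return (usec_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
    }

    int main(void)
    {
        /* A queue entry set to expire 5s from now keeps expiring in ~5s
         * even if the wall clock is stepped in between. */
        usec_t expiry = monotonic_now() + 5 * 1000000;
        printf("expires in %lld us\n", (long long)(expiry - monotonic_now()));
        return 0;
    }

The fallback to apr_time_now() in the patch keeps the same microsecond unit,
so the queue and timer arithmetic is unchanged either way.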
* configure.in(): Provide HAVE_TIME_H, HAVE_CLOCK_GETTIME and HAVE_CLOCK_GETRES. * server/mpm/event/event.c(event_time_now): New helper to get a monotonic timestamp from clock_gettime() if it's available, or apr_time_now() (i.e. gettimeofday()) otherwise. * server/mpm/event/event.c(process_socket, event_resume_suspended, event_get_timer_event, process_lingering_close, listener_thread, event_run): Use event_time_now(). --- configure.in | 5 ++ server/mpm/event/event.c | 112 +++++++++++++++++++++++++++++++++++---- 2 files changed, 107 insertions(+), 10 deletions(-) diff --git a/configure.in b/configure.in index c56c8972afd..4b2098d8034 100644 --- a/configure.in +++ b/configure.in @@ -471,6 +471,8 @@ AC_CHECK_HEADERS( \ string.h \ limits.h \ unistd.h \ +time.h \ +mach/mach_time.h \ sys/socket.h \ pwd.h \ grp.h \ @@ -534,6 +536,9 @@ getpwnam \ getgrnam \ initgroups \ bindprocessor \ +clock_getres \ +clock_gettime \ +clock_gettime_nsec_np \ prctl \ procctl \ pthread_getthreadid_np \ diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 64ff1e30ead..795f4b1f37c 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -73,6 +73,9 @@ #ifdef HAVE_SYS_PROCESSOR_H #include /* for bindprocessor() */ #endif +#ifdef HAVE_TIME_H +#include /* for clock_gettime() */ +#endif #if !APR_HAS_THREADS #error The Event MPM requires APR threads, but they are unavailable. @@ -336,6 +339,93 @@ static APR_INLINE const char *cs_state_str(event_conn_state_t *cs) */ static event_conn_state_t *volatile defer_linger_chain; +#define USE_CLOCK_COARSE 0 /* not for now */ +#if HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC) /* POSIX */ +static clockid_t event_clockid; +#elif HAVE_CLOCK_GETTIME_NSEC_NP && defined(CLOCK_UPTIME_RAW) /* Newer OSX */ +/* All #include'd by already */ +#elif HAVE_MACH_MACH_TIME_H /* Older OSX */ +#include +#endif + +static void event_time_init(void) +{ +#if HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC) + event_clockid = (clockid_t)-1; + +#if HAVE_CLOCK_GETRES && defined(CLOCK_MONOTONIC_COARSE) && USE_CLOCK_COARSE + if (event_clockid == (clockid_t)-1) { + struct timespec ts; + if (clock_getres(CLOCK_MONOTONIC_COARSE, &ts) == 0) { + apr_time_t res = apr_time_from_sec(ts.tv_sec) + ts.tv_nsec / 1000; + if (res <= TIMERS_FUDGE_TIMEOUT) { + event_clockid = CLOCK_MONOTONIC_COARSE; + } + } + } +#endif /* CLOCK_MONOTONIC_COARSE */ + +#if HAVE_CLOCK_GETRES && defined(CLOCK_MONOTONIC_FAST) && USE_CLOCK_COARSE + if (event_clockid == (clockid_t)-1) { + struct timespec ts; + if (clock_getres(CLOCK_MONOTONIC_FAST, &ts) == 0) { + apr_time_t res = apr_time_from_sec(ts.tv_sec) + ts.tv_nsec / 1000; + if (res <= TIMERS_FUDGE_TIMEOUT) { + event_clockid = CLOCK_MONOTONIC_FAST; + } + } + } +#endif /* CLOCK_MONOTONIC_FAST */ + +#if HAVE_CLOCK_GETRES && defined(CLOCK_MONOTONIC_RAW_APPROX) && USE_CLOCK_COARSE + if (event_clockid == (clockid_t)-1) { + struct timespec ts; + if (clock_getres(CLOCK_MONOTONIC_RAW_APPROX, &ts) == 0) { + apr_time_t res = apr_time_from_sec(ts.tv_sec) + ts.tv_nsec / 1000; + if (res <= TIMERS_FUDGE_TIMEOUT) { + event_clockid = CLOCK_MONOTONIC_RAW_APPROX; + } + } + } +#endif /* CLOCK_MONOTONIC_RAW_APPROX */ + + if (event_clockid == (clockid_t)-1) { +#if defined(CLOCK_MONOTONIC_RAW) + event_clockid = CLOCK_MONOTONIC_RAW; +#else + event_clockid = CLOCK_MONOTONIC; +#endif + } + +#endif /* HAVE_CLOCK_GETTIME */ +} + +static apr_time_t event_time_now(void) +{ +#if HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC) + + struct timespec ts; + clock_gettime(event_clockid, &ts); + 
return apr_time_from_sec(ts.tv_sec) + ts.tv_nsec / 1000; + +#elif HAVE_CLOCK_GETTIME_NSEC_NP && defined(CLOCK_UPTIME_RAW) + + return clock_gettime_nsec_np(CLOCK_UPTIME_RAW) / 1000; + +#elif HAVE_MACH_MACH_TIME_H + + mach_timebase_info_data_t ti; + mach_timebase_info(&ti); + return mach_continuous_time() * ti.numer / ti.denom / 1000; + +#else + + /* XXX: not monotonic, still some platform to care about? */ + return apr_time_now(); + +#endif +} + APR_RING_HEAD(timeout_head_t, event_conn_state_t); struct timeout_queue { struct timeout_head_t head; @@ -375,7 +465,7 @@ static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *cs) ap_assert(q && !cs->q); cs->q = q; - cs->queue_timestamp = apr_time_now(); + cs->queue_timestamp = event_time_now(); APR_RING_INSERT_TAIL(&q->head, cs, event_conn_state_t, timeout_list); ++*q->total; ++q->count; @@ -1786,7 +1876,7 @@ static timer_event_t *get_timer_event(apr_time_t timeout, apr_array_header_t *pfds) { timer_event_t *te; - apr_time_t now = (timeout < 0) ? 0 : apr_time_now(); + apr_time_t now = (timeout < 0) ? 0 : event_time_now(); /* oh yeah, and make locking smarter/fine grained. */ @@ -2158,7 +2248,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) int have_idle_worker = 0; apr_time_t last_log; - last_log = apr_time_now(); + last_log = event_time_now(); free(ti); #if HAVE_SERF @@ -2207,7 +2297,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } if (APLOGtrace6(ap_server_conf)) { - now = apr_time_now(); + now = event_time_now(); /* trace log status every second */ if (now - last_log > apr_time_from_sec(1)) { ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, @@ -2240,7 +2330,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * up occurs, otherwise periodic checks (maintenance, shutdown, ...) * must be performed. */ - now = apr_time_now(); + now = event_time_now(); timeout = -1; /* Push expired timers to a worker, the first remaining one (if any) @@ -2333,7 +2423,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) if (APLOGtrace7(ap_server_conf)) { apr_time_t old_now = now; - now = apr_time_now(); + now = event_time_now(); ap_log_error(APLOG_MARK, APLOG_TRACE7, rc, ap_server_conf, "pollset: have #%i time=%" APR_TIME_T_FMT "/%" APR_TIME_T_FMT @@ -2549,7 +2639,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) */ next_expiry = queues_next_expiry; do_maintenance: - if (next_expiry && next_expiry <= (now = apr_time_now())) { + if (next_expiry && next_expiry <= (now = event_time_now())) { ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "queues maintenance: expired=%" APR_TIME_T_FMT, next_expiry > 0 ? now - next_expiry : -1); @@ -3257,7 +3347,7 @@ static void child_main(int child_num_arg, int child_bucket) } /* For rand() users (e.g. skiplist). 
*/ - srand((unsigned int)apr_time_now()); + srand((unsigned int)event_time_now()); ap_run_child_init(pchild, ap_server_conf); @@ -4057,7 +4147,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) } if (ap_graceful_shutdown_timeout) { - cutoff = apr_time_now() + + cutoff = event_time_now() + apr_time_from_sec(ap_graceful_shutdown_timeout); } @@ -4079,7 +4169,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) } } } while (!retained->mpm->shutdown_pending && active_children && - (!ap_graceful_shutdown_timeout || apr_time_now() < cutoff)); + (!ap_graceful_shutdown_timeout || event_time_now() < cutoff)); /* We might be here because we received SIGTERM, either * way, try and make sure that all of our processes are @@ -4210,6 +4300,8 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, foreground = ap_exists_config_define("FOREGROUND"); } + event_time_init(); + retained = ap_retained_data_get(userdata_key); if (!retained) { retained = ap_retained_data_create(userdata_key, sizeof(*retained)); From 8f3ed4cc7a3d20b864ee898930e189e39cce55fa Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 9 Jul 2024 15:53:33 +0200 Subject: [PATCH 05/22] mpm_event: No need/use of "clogged" connections count, axe. --- server/mpm/event/event.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 795f4b1f37c..4e544ccdec9 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -205,7 +205,6 @@ static volatile int listener_may_exit = 0; static apr_uint32_t connection_count = 0; /* Number of open connections */ static apr_uint32_t lingering_count = 0; /* Number of connections in lingering close */ static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ -static apr_uint32_t clogged_count = 0; /* Number of threads processing ssl conns */ static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown early during graceful termination */ static int had_healthy_child = 0; @@ -703,8 +702,7 @@ static void disable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) "Suspend listening sockets: idlers:%i conns:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u " - "suspended:%u clogged:%u", + "waitio:%u write:%u keepalive:%u linger:%u/%u suspended:%u", ap_queue_info_num_idlers(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(waitio_q->total), @@ -712,8 +710,7 @@ static void disable_listensocks(void) apr_atomic_read32(keepalive_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), - apr_atomic_read32(&suspended_count), - apr_atomic_read32(&clogged_count)); + apr_atomic_read32(&suspended_count)); ap_scoreboard_image->parent[ap_child_slot].not_accepting = 1; @@ -732,8 +729,7 @@ static void enable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) "Resume listening sockets: idlers:%i conns:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u " - "suspended:%u clogged:%u", + "waitio:%u write:%u keepalive:%u linger:%u/%u suspended:%u", ap_queue_info_num_idlers(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(waitio_q->total), @@ -741,8 +737,7 @@ static void enable_listensocks(void) apr_atomic_read32(keepalive_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), - apr_atomic_read32(&suspended_count), - 
apr_atomic_read32(&clogged_count));
+                  apr_atomic_read32(&suspended_count));
 
     /*
      * XXX: This is not yet optimal. If many workers suddenly become available,
@@ -1415,7 +1410,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
 {
     conn_rec *c = cs->c;
     long conn_id = ID_FROM_CHILD_THREAD(my_child_num, my_thread_num);
-    int rc = OK, processed = 0, clogging;
+    int rc = OK, processed = 0;
 
     if (!c) { /* This is a new connection */
         cs->bucket_alloc = apr_bucket_alloc_create(p);
@@ -1469,14 +1464,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
 process_connection:
     processed = 1;
     cs->pub.state = CONN_STATE_PROCESSING;
-    clogging = c->clogging_input_filters;
-    if (clogging) {
-        apr_atomic_inc32(&clogged_count);
-    }
     rc = ap_run_process_connection(c);
-    if (clogging) {
-        apr_atomic_dec32(&clogged_count);
-    }
     /*
      * The process_connection hooks should set the appropriate connection
      * state upon return, for event MPM to either:
@@ -2302,15 +2290,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
             if (now - last_log > apr_time_from_sec(1)) {
                 ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf,
                              "connections: %u (waitio:%u write:%u keepalive:%u "
-                             "lingering:%u suspended:%u clogged:%u), "
-                             "workers: %u/%u shutdown",
+                             "lingering:%u suspended:%u), workers: %u/%u shutdown",
                              apr_atomic_read32(&connection_count),
                              apr_atomic_read32(waitio_q->total),
                              apr_atomic_read32(write_completion_q->total),
                              apr_atomic_read32(keepalive_q->total),
                              apr_atomic_read32(&lingering_count),
                              apr_atomic_read32(&suspended_count),
-                             apr_atomic_read32(&clogged_count),
                              apr_atomic_read32(&threads_shutdown),
                              threads_per_child);
                 last_log = now;

From f1367ba03edeaaa1ddd451b2561b69e20c976c13 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 1 Feb 2022 22:24:15 +0100
Subject: [PATCH 06/22] mpm_event: Use r->server's Timeout after the
 post_read_request hook.

Regardless of keep_alive_timeout_set, which anyway only concerns the
KeepAliveTimeout to apply _after_ the current request, always use the
request's server Timeout during its processing (i.e. CONN_STATE_HEAR
and CONN_STATE_COMPLETION).

To save the next KeepAliveTimeout to use later, add a new event_srv_cfg
to the conn_state which points to the appropriate server (either
r->server or c->base_server depending on keep_alive_timeout_set as
before).

* server/mpm/event/event.c(struct event_conn_state_t):
  Add event_srv_cfg *ka_sc as the server config to apply for kept alive
  connections.

* server/mpm/event/event.c(event_post_read_request):
  Always set cs->sc to the event_srv_cfg of the request's server, and
  point cs->ka_sc to the appropriate one according to
  keep_alive_timeout_set.

* server/mpm/event/event.c(make_conn_state):
  Initialize cs->ka_sc to the ap_server_conf's event_srv_cfg, like
  cs->sc.

* server/mpm/event/event.c(process_socket):
  Use cs->ka_sc->ka_q for CONN_STATE_KEEPALIVE.
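In effect, the timeout selection can be summarized by this minimal
standalone sketch (illustration only, not part of the patch; the helper
name timeout_server() is hypothetical):

    #include "httpd.h"  /* request_rec, conn_rec, server_rec */

    /* During request processing the request's server always applies
     * (cs->sc); for the next keep-alive wait the request's server
     * applies only if it sets KeepAliveTimeout explicitly, otherwise
     * the base server (first on this IP:port) does (cs->ka_sc).
     */
    static server_rec *timeout_server(request_rec *r, int keepalive)
    {
        if (!keepalive) {
            return r->server;                      /* -> cs->sc */
        }
        return r->server->keep_alive_timeout_set
               ? r->server                         /* -> cs->ka_sc */
               : r->connection->base_server;
    }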
--- server/mpm/event/event.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 4e544ccdec9..601a23dd9f6 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -251,6 +251,8 @@ struct event_conn_state_t { request_rec *r; /** server config this struct refers to */ event_srv_cfg *sc; + /** server config this struct refers to during keepalive */ + event_srv_cfg *ka_sc; /** scoreboard handle for the conn_rec */ ap_sb_handle_t *sbh; /** bucket allocator */ @@ -1224,18 +1226,23 @@ static int event_post_read_request(request_rec *r) event_conn_state_t *cs = ap_get_module_config(c->conn_config, &mpm_event_module); + /* Use Timeout from the request's server. */ + cs->sc = ap_get_module_config(r->server->module_config, + &mpm_event_module); + /* To preserve legacy behaviour (consistent with other MPMs), use - * the keepalive timeout from the base server (first on this IP:port) - * when none is explicitly configured on this server. + * KeepaliveTimeout from the base server (first on this IP:port) + * when none is explicitly configured on this server. Otherwise + * use the one from the request's server. */ - if (r->server->keep_alive_timeout_set) { - cs->sc = ap_get_module_config(r->server->module_config, - &mpm_event_module); + if (!r->server->keep_alive_timeout_set) { + cs->ka_sc = ap_get_module_config(c->base_server->module_config, + &mpm_event_module); } else { - cs->sc = ap_get_module_config(c->base_server->module_config, - &mpm_event_module); + cs->ka_sc = cs->sc; } + return OK; } @@ -1352,8 +1359,8 @@ static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd) APR_RING_ELEM_INIT(cs, timeout_list); - cs->sc = ap_get_module_config(ap_server_conf->module_config, - &mpm_event_module); + cs->sc = cs->ka_sc = ap_get_module_config(ap_server_conf->module_config, + &mpm_event_module); /** * XXX If the platform does not have a usable way of bundling @@ -1594,7 +1601,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, */ notify_suspend(cs); - if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->sc->ka_q)) { + if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q)) { apr_table_setn(cs->c->notes, "short-lingering-close", "1"); cs->pub.state = CONN_STATE_LINGER; goto lingering_close; From 589d21a0cacf822d905f3c37632be102b243921f Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 27 Jun 2023 03:26:56 +0200 Subject: [PATCH 07/22] mpm_event: Add kill_connection() to log (APLOG_INFO) interrupted connections. 
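kill_connection() follows the same caller-location pattern as
close_connection(): a *_at() helper plus a macro filling in
__FUNCTION__/__LINE__, so the APLOG_INFO message points at the call
site rather than at the helper. A standalone sketch of the pattern
(hypothetical names, fprintf() standing in for ap_log_cerror()):

    #include <stdio.h>

    static void kill_conn_at(int status, const char *at, int line)
    {
        /* reports which caller interrupted the connection and why */
        fprintf(stderr, "killing connection: status=%d at %s:%i\n",
                status, at, line);
    }
    #define kill_conn(status) kill_conn_at((status), __FUNCTION__, __LINE__)

    int main(void)
    {
        kill_conn(1); /* the status value is irrelevant to the pattern */
        return 0;
    }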
---
 server/mpm/event/event.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index 601a23dd9f6..b58fc50bd94 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -1154,6 +1154,25 @@ static void close_connection_at(event_conn_state_t *cs,
 #define close_connection(cs) \
     close_connection_at((cs), __FUNCTION__, __LINE__)
 
+static void kill_connection_at(event_conn_state_t *cs, apr_status_t status,
+                               const char *at, int line)
+{
+    if (cs->c) {
+        ap_log_cerror(APLOG_MARK, APLOG_INFO, status, cs->c, APLOGNO(10382)
+                      "killing connection in %s at %s:%i",
+                      cs_state_str(cs), at, line);
+    }
+    else {
+        ap_log_error(APLOG_MARK, APLOG_INFO, status, ap_server_conf, APLOGNO(10383)
+                     "killing unprocessed connection from %pI in %s at %s:%i",
+                     cs_raddr(cs), cs_state_str(cs), at, line);
+    }
+
+    close_connection_at(cs, at, line);
+}
+#define kill_connection(cs, status) \
+    kill_connection_at((cs), (status), __FUNCTION__, __LINE__)
+
 /* forward declare */
 static void set_conn_state_sense(event_conn_state_t *cs, int sense);
 
@@ -1787,7 +1806,7 @@ static apr_status_t push2worker(event_conn_state_t *cs, apr_socket_t *csd,
          * socket to a worker
          */
         if (cs) {
-            shutdown_connection(cs);
+            kill_connection(cs, rc);
         }
         else {
             if (csd) {
@@ -2178,7 +2197,7 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry,
             TO_QUEUE_REMOVE(qp, cs);
             if (!pollset_del(cs, 1)) {
-                shutdown_connection(cs);
+                kill_connection(cs, APR_EGENERAL);
                 continue;
             }
 
@@ -2468,7 +2487,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
             }
 
             if (!pollset_del(cs, 0)) {
-                shutdown_connection(cs);
+                /* Can't go anywhere, kill (and log) and next. */
+                kill_connection(cs, APR_EGENERAL);
                 continue;
             }

From 0ea6ae6162fbcdf4e659cacd16d07f93a61dbef7 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 1 Feb 2022 17:17:11 +0100
Subject: [PATCH 08/22] core,mod_reqtimeout: Add ap_get_connection_timeout().

Provide a new min_connection_timeout hook that modules enforcing a
dynamic connection timeout (e.g. mod_reqtimeout) should use to inform
ap_get_connection_timeout() users about the current timeout being
applied.

Expose the current timeout enforced by mod_reqtimeout by implementing
the min_connection_timeout hook.

* include/ap_mmn.h():
  Minor bump for min_connection_timeout and ap_get_connection_timeout().

* include/http_connection.h():
  Declare min_connection_timeout and ap_get_connection_timeout().

* server/connection.c():
  Implement min_connection_timeout and ap_get_connection_timeout().

* modules/filters/mod_reqtimeout.c(struct reqtimeout_stage_t):
  Add server_timeout as the timeout defined for the server at the
  current stage.

* modules/filters/mod_reqtimeout.c(struct reqtimeout_con_cfg):
  Add time_left as the dynamic timeout enforced by mod_reqtimeout at
  the current stage.

* modules/filters/mod_reqtimeout.c(check_time_left):
  Store the computed time_left in the reqtimeout_con_cfg, and set the
  socket timeout there (returning an error which will be caught if
  that fails).

* modules/filters/mod_reqtimeout.c(extend_timeout):
  Update time_left in the reqtimeout_con_cfg per the time taken by the
  last read.

* modules/filters/mod_reqtimeout.c(reqtimeout_filter):
  Remove the special path for APR_NONBLOCK_READ or AP_MODE_EATCRLF; it
  does the exact same thing as the !(AP_MODE_GETLINE && APR_BLOCK_READ)
  one.
* modules/filters/mod_reqtimeout.c(reqtimeout_init, reqtimeout_before_header, reqtimeout_before_body, INIT_STAGE): Set the server_timeout in the current stage. * modules/filters/mod_reqtimeout.c(reqtimeout_min_timeout): The new hook implementation. --- include/ap_mmn.h | 4 +- include/http_connection.h | 5 ++ modules/filters/mod_reqtimeout.c | 127 +++++++++++++++++++------------ server/connection.c | 16 ++++ 4 files changed, 101 insertions(+), 51 deletions(-) diff --git a/include/ap_mmn.h b/include/ap_mmn.h index acfa61e22b5..fb8f4512d47 100644 --- a/include/ap_mmn.h +++ b/include/ap_mmn.h @@ -733,6 +733,8 @@ * 20211221.25 (2.5.1-dev) AP_SLASHES and AP_IS_SLASH * 20211221.26 (2.5.1-dev) Add AGAIN, ap_check_input_pending() and * ap_check_output_pending() + * 20211221.27 (2.5.1-dev) Add min_connection_timeout hook and + * ap_get_connection_timeout() */ #define MODULE_MAGIC_COOKIE 0x41503235UL /* "AP25" */ @@ -740,7 +742,7 @@ #ifndef MODULE_MAGIC_NUMBER_MAJOR #define MODULE_MAGIC_NUMBER_MAJOR 20211221 #endif -#define MODULE_MAGIC_NUMBER_MINOR 26 /* 0...n */ +#define MODULE_MAGIC_NUMBER_MINOR 27 /* 0...n */ /** * Determine if the server's current MODULE_MAGIC_NUMBER is at least a diff --git a/include/http_connection.h b/include/http_connection.h index a89113bcb3b..601a4769109 100644 --- a/include/http_connection.h +++ b/include/http_connection.h @@ -196,6 +196,11 @@ AP_DECLARE(conn_rec *) ap_create_secondary_connection(apr_pool_t *pool, conn_rec *master, apr_bucket_alloc_t *alloc); +AP_DECLARE_HOOK(int, min_connection_timeout, + (conn_rec *c, server_rec *s, apr_interval_time_t *min_timeout)) + +AP_DECLARE(apr_interval_time_t) ap_get_connection_timeout(conn_rec *c, + server_rec *s); /** End Of Connection (EOC) bucket */ AP_DECLARE_DATA extern const apr_bucket_type_t ap_bucket_type_eoc; diff --git a/modules/filters/mod_reqtimeout.c b/modules/filters/mod_reqtimeout.c index 0e5afca57e4..693351e1280 100644 --- a/modules/filters/mod_reqtimeout.c +++ b/modules/filters/mod_reqtimeout.c @@ -45,6 +45,7 @@ typedef struct int max_timeout; /* max timeout in secs */ int min_rate; /* min rate in bytes/s */ apr_time_t rate_factor; /* scale factor (#usecs per min_rate) */ + apr_interval_time_t server_timeout; /* server timeout at this stage */ } reqtimeout_stage_t; typedef struct @@ -59,6 +60,7 @@ typedef struct { apr_time_t timeout_at; apr_time_t max_timeout_at; + apr_interval_time_t time_left; reqtimeout_stage_t cur_stage; int in_keep_alive; char *type; @@ -74,34 +76,45 @@ static int default_body_rate_factor; static void extend_timeout(reqtimeout_con_cfg *ccfg, apr_bucket_brigade *bb) { apr_off_t len; + apr_time_t old_timeout_at; apr_time_t new_timeout_at; if (apr_brigade_length(bb, 0, &len) != APR_SUCCESS || len <= 0) return; - new_timeout_at = ccfg->timeout_at + len * ccfg->cur_stage.rate_factor; + old_timeout_at = ccfg->timeout_at; + new_timeout_at = old_timeout_at + len * ccfg->cur_stage.rate_factor; if (ccfg->max_timeout_at > 0 && new_timeout_at > ccfg->max_timeout_at) { ccfg->timeout_at = ccfg->max_timeout_at; } else { ccfg->timeout_at = new_timeout_at; } + + ccfg->time_left += new_timeout_at - old_timeout_at; + if (ccfg->time_left > ccfg->cur_stage.server_timeout) { + ccfg->time_left = ccfg->cur_stage.server_timeout; + } } -static apr_status_t check_time_left(reqtimeout_con_cfg *ccfg, - apr_time_t *time_left_p, - apr_time_t now) +static apr_status_t check_and_update_time_left(reqtimeout_con_cfg *ccfg, + apr_time_t now) { if (!now) now = apr_time_now(); - *time_left_p = ccfg->timeout_at - now; - if 
(*time_left_p <= 0) + + ccfg->time_left = ccfg->timeout_at - now; + if (ccfg->time_left <= 0) return APR_TIMEUP; - if (*time_left_p < apr_time_from_sec(1)) { - *time_left_p = apr_time_from_sec(1); + if (ccfg->time_left < apr_time_from_sec(1)) { + ccfg->time_left = apr_time_from_sec(1); } - return APR_SUCCESS; + else if (ccfg->time_left > ccfg->cur_stage.server_timeout) { + ccfg->time_left = ccfg->cur_stage.server_timeout; + } + + return apr_socket_timeout_set(ccfg->socket, ccfg->time_left); } static apr_status_t have_lf_or_eos(apr_bucket_brigade *bb) @@ -168,16 +181,14 @@ static apr_status_t brigade_append(apr_bucket_brigade *bbOut, apr_bucket_brigade } -#define MIN(x,y) ((x) < (y) ? (x) : (y)) static apr_status_t reqtimeout_filter(ap_filter_t *f, apr_bucket_brigade *bb, ap_input_mode_t mode, apr_read_type_e block, apr_off_t readbytes) { - apr_time_t time_left; - apr_time_t now = 0; apr_status_t rv; + apr_time_t now = 0; apr_interval_time_t saved_sock_timeout = UNSET; reqtimeout_con_cfg *ccfg = f->ctx; @@ -198,11 +209,11 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, /* set new timeout */ now = apr_time_now(); ccfg->timeout_at = now + apr_time_from_sec(ccfg->cur_stage.timeout); - ccfg->cur_stage.timeout = 0; if (ccfg->cur_stage.max_timeout > 0) { ccfg->max_timeout_at = now + apr_time_from_sec(ccfg->cur_stage.max_timeout); ccfg->cur_stage.max_timeout = 0; } + ccfg->cur_stage.timeout = 0; } else if (ccfg->timeout_at == 0) { /* no timeout set, or in between requests */ @@ -213,39 +224,30 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, ccfg->socket = ap_get_conn_socket(f->c); } - rv = check_time_left(ccfg, &time_left, now); - if (rv != APR_SUCCESS) - goto out; - - if (block == APR_NONBLOCK_READ || mode == AP_MODE_EATCRLF) { - rv = ap_get_brigade(f->next, bb, mode, block, readbytes); - if (ccfg->cur_stage.rate_factor && rv == APR_SUCCESS) { - extend_timeout(ccfg, bb); - } - return rv; - } - rv = apr_socket_timeout_get(ccfg->socket, &saved_sock_timeout); AP_DEBUG_ASSERT(rv == APR_SUCCESS); - rv = apr_socket_timeout_set(ccfg->socket, MIN(time_left, saved_sock_timeout)); - AP_DEBUG_ASSERT(rv == APR_SUCCESS); + rv = check_and_update_time_left(ccfg, now); + if (rv != APR_SUCCESS) + goto cleanup; + + if (mode == AP_MODE_GETLINE && block == APR_BLOCK_READ) { + apr_off_t remaining = HUGE_STRING_LEN; +#if APR_MAJOR_VERSION < 2 + apr_int32_t nsds; + apr_interval_time_t poll_timeout; + apr_pollfd_t pollset; + pollset.p = NULL; +#endif - if (mode == AP_MODE_GETLINE) { /* * For a blocking AP_MODE_GETLINE read, apr_brigade_split_line() * would loop until a whole line has been read. As this would make it * impossible to enforce a total timeout, we only do non-blocking * reads. */ - apr_off_t remaining = HUGE_STRING_LEN; do { apr_off_t bblen; -#if APR_MAJOR_VERSION < 2 - apr_int32_t nsds; - apr_interval_time_t poll_timeout; - apr_pollfd_t pollset; -#endif rv = ap_get_brigade(f->next, bb, AP_MODE_GETLINE, APR_NONBLOCK_READ, remaining); if (rv != APR_SUCCESS && !APR_STATUS_IS_EAGAIN(rv)) { @@ -282,10 +284,12 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, /* ... 
and wait for more */ #if APR_MAJOR_VERSION < 2 - pollset.p = f->c->pool; - pollset.desc_type = APR_POLL_SOCKET; - pollset.reqevents = APR_POLLIN|APR_POLLHUP; - pollset.desc.s = ccfg->socket; + if (pollset.p == NULL) { + pollset.p = f->c->pool; + pollset.desc_type = APR_POLL_SOCKET; + pollset.reqevents = APR_POLLIN | APR_POLLHUP | APR_POLLERR; + pollset.desc.s = ccfg->socket; + } apr_socket_timeout_get(ccfg->socket, &poll_timeout); rv = apr_poll(&pollset, 1, &nsds, poll_timeout); #else @@ -294,14 +298,10 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, if (rv != APR_SUCCESS) break; - rv = check_time_left(ccfg, &time_left, 0); + rv = check_and_update_time_left(ccfg, 0); if (rv != APR_SUCCESS) break; - rv = apr_socket_timeout_set(ccfg->socket, - MIN(time_left, saved_sock_timeout)); - AP_DEBUG_ASSERT(rv == APR_SUCCESS); - } while (1); if (ccfg->tmpbb) @@ -310,19 +310,21 @@ static apr_status_t reqtimeout_filter(ap_filter_t *f, } else { /* mode != AP_MODE_GETLINE */ rv = ap_get_brigade(f->next, bb, mode, block, readbytes); + /* Don't extend the timeout in speculative mode, wait for * the real (relevant) bytes to be asked later, within the * currently allotted time. */ - if (ccfg->cur_stage.rate_factor && rv == APR_SUCCESS - && mode != AP_MODE_SPECULATIVE) { + if (rv == APR_SUCCESS + && mode != AP_MODE_SPECULATIVE + && ccfg->cur_stage.rate_factor) { extend_timeout(ccfg, bb); } } +cleanup: apr_socket_timeout_set(ccfg->socket, saved_sock_timeout); -out: if (APR_STATUS_IS_TIMEUP(rv)) { ap_log_cerror(APLOG_MARK, APLOG_INFO, 0, f->c, APLOGNO(01382) "Request %s read timeout", ccfg->type); @@ -353,7 +355,7 @@ static apr_status_t reqtimeout_eor(ap_filter_t *f, apr_bucket_brigade *bb) return ap_pass_brigade(f->next, bb); } -#define INIT_STAGE(cfg, ccfg, stage) do { \ +#define INIT_STAGE(cfg, ccfg, stage, s_timeout) do { \ if (cfg->stage.timeout != UNSET) { \ ccfg->cur_stage.timeout = cfg->stage.timeout; \ ccfg->cur_stage.max_timeout = cfg->stage.max_timeout; \ @@ -364,6 +366,8 @@ static apr_status_t reqtimeout_eor(ap_filter_t *f, apr_bucket_brigade *bb) ccfg->cur_stage.max_timeout = MRT_DEFAULT_##stage##_MAX_TIMEOUT; \ ccfg->cur_stage.rate_factor = default_##stage##_rate_factor; \ } \ + ccfg->cur_stage.server_timeout = s_timeout; \ + ccfg->time_left = ccfg->cur_stage.timeout; \ } while (0) static int reqtimeout_init(conn_rec *c) @@ -392,7 +396,7 @@ static int reqtimeout_init(conn_rec *c) ccfg->type = "handshake"; if (cfg->handshake.timeout > 0) { - INIT_STAGE(cfg, ccfg, handshake); + INIT_STAGE(cfg, ccfg, handshake, c->base_server->timeout); } } @@ -422,7 +426,7 @@ static void reqtimeout_before_header(request_rec *r, conn_rec *c) ccfg->timeout_at = 0; ccfg->max_timeout_at = 0; ccfg->in_keep_alive = (c->keepalives > 0); - INIT_STAGE(cfg, ccfg, header); + INIT_STAGE(cfg, ccfg, header, c->base_server->timeout); } static int reqtimeout_before_body(request_rec *r) @@ -447,11 +451,31 @@ static int reqtimeout_before_body(request_rec *r) ccfg->cur_stage.timeout = 0; } else { - INIT_STAGE(cfg, ccfg, body); + INIT_STAGE(cfg, ccfg, body, r->server->timeout); } return OK; } +static int reqtimeout_min_timeout(conn_rec *c, server_rec *s/*unused*/, + apr_interval_time_t *min_timeout) +{ + reqtimeout_con_cfg *ccfg = ap_get_module_config(c->conn_config, + &reqtimeout_module); + reqtimeout_stage_t *stage = &ccfg->cur_stage; + + if (stage->timeout > 0 || ccfg->timeout_at) { + if (ccfg->time_left <= 0) { + *min_timeout = 0; + } + else if (*min_timeout < 0 || *min_timeout > ccfg->time_left) { + *min_timeout = 
ccfg->time_left;
+        }
+        return OK;
+    }
+
+    return DECLINED;
+}
+
 #define UNSET_STAGE(cfg, stage) do { \
     cfg->stage.timeout = UNSET; \
     cfg->stage.max_timeout = UNSET; \
@@ -637,6 +661,9 @@ static void reqtimeout_hooks(apr_pool_t *pool)
     ap_hook_post_read_request(reqtimeout_before_body, NULL, NULL,
                               APR_HOOK_MIDDLE);
 
+    ap_hook_min_connection_timeout(reqtimeout_min_timeout, NULL, NULL,
+                                   APR_HOOK_MIDDLE);
+
 #if MRT_DEFAULT_handshake_MIN_RATE
     default_handshake_rate_factor = apr_time_from_sec(1)
                                     / MRT_DEFAULT_handshake_MIN_RATE;
diff --git a/server/connection.c b/server/connection.c
index f32a1f3712c..a1c4c1860f0 100644
--- a/server/connection.c
+++ b/server/connection.c
@@ -36,6 +36,7 @@ APR_HOOK_STRUCT(
     APR_HOOK_LINK(pre_connection)
     APR_HOOK_LINK(pre_close_connection)
     APR_HOOK_LINK(create_secondary_connection)
+    APR_HOOK_LINK(min_connection_timeout)
 )
 AP_IMPLEMENT_HOOK_RUN_FIRST(conn_rec *,create_connection,
                             (apr_pool_t *p, server_rec *server, apr_socket_t *csd, long conn_id, void *sbh, apr_bucket_alloc_t *alloc),
@@ -46,6 +47,9 @@ AP_IMPLEMENT_HOOK_RUN_ALL(int,pre_close_connection,(conn_rec *c),(c),OK,DECLINED
 AP_IMPLEMENT_HOOK_RUN_FIRST(conn_rec *,create_secondary_connection,
                             (apr_pool_t *p, conn_rec *master, apr_bucket_alloc_t *alloc),
                             (p, master, alloc), NULL)
+AP_IMPLEMENT_HOOK_RUN_ALL(int,min_connection_timeout,
+                          (conn_rec *c, server_rec *s, apr_interval_time_t *min_timeout),
+                          (c, s, min_timeout),OK,DECLINED)
 
 AP_DECLARE(conn_rec *) ap_create_connection(apr_pool_t *p,
                                             server_rec *server,
@@ -251,3 +255,15 @@ AP_CORE_DECLARE(void) ap_process_connection(conn_rec *c, void *csd)
         ap_run_process_connection(c);
     }
 }
+
+AP_DECLARE(apr_interval_time_t) ap_get_connection_timeout(conn_rec *c,
+                                                          server_rec *s)
+{
+    apr_interval_time_t timeout = -1;
+
+    if (ap_run_min_connection_timeout(c, s, &timeout) != OK || timeout < 0) {
+        timeout = (s) ? s->timeout : c->base_server->timeout;
+    }
+
+    return timeout;
+}

From 8bddc079c906fb556f6507026d32a6c3b1dcaae7 Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 1 Feb 2022 17:25:48 +0100
Subject: [PATCH 09/22] mpm_event: Use ap_get_connection_timeout() for
 CONN_STATE_ASYNC_WAITIO.

If ap_run_process_connection() returns CONN_STATE_ASYNC_WAITIO and the
connection timeout as returned by ap_get_connection_timeout() differs
from the waitio_q timeout, use a timer event rather than the waitio_q
to keep track of the idle connection.

* server/mpm_fdqueue.h(struct timer_event_t):
  Add the "timeout" field to store the timeout of the timer; recomputing
  it from "when" would otherwise require calling apr_time_now().

* server/mpm/event/event.c():
  #define TIMERS_FUDGE_TIMEOUT as the minimal timer event timeout, to
  prevent the events from firing before the sockets are added to the
  pollset. Currently set to 50ms (an arbitrary value).

* server/mpm/event/event.c(struct event_conn_state_t):
  Add the timer_event_t *te field as an alternative to the q.

* server/mpm/event/event.c(struct event_srv_cfg_s):
  Add the server_rec *s field to backref the server_rec and easily pass
  cs->sc->s to ap_get_connection_timeout().

* server/mpm/event/event.c(pollset_add_at, pollset_del_at):
  If the connection is attached to a timer event, log a "t" instead of
  a "q" and the timer's timeout instead of the q's.

* server/mpm/event/event.c(process_socket):
  If ap_get_connection_timeout() differs from the waitio_q timeout,
  acquire a timer event and associate it with the conn_state. A timer
  event associated with a conn_state has a NULL callback (cbfn); see
  the sketch below.
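A standalone sketch of that NULL-callback convention (hypothetical
minimal types, illustration only):

    #include <stddef.h>
    #include <stdio.h>

    typedef struct timer_ev {
        void (*cbfunc)(void *baton);  /* NULL: baton is a connection */
        void *baton;
    } timer_ev;

    static void fire(timer_ev *te)
    {
        if (te->cbfunc == NULL) {
            /* conn_state timer: time out the idle connection */
            fprintf(stderr, "timed out connection %p\n", te->baton);
        }
        else {
            /* user timer: run (or push to a worker) the callback */
            te->cbfunc(te->baton);
        }
    }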
* server/mpm/event/event.c(get_timer_event):
  Set the given timeout to the ->timeout field.

* server/mpm/event/event.c(event_register_timed_callback_ex,
  event_register_poll_callback_ex):
  Return APR_EINVAL if the given callbacks are NULL; this is reserved
  for conn_state timers now. Since it would have crashed at some point
  to pass NULL callbacks before, it's not really an API change.

* server/mpm/event/event.c(listener_thread):
  Fix the poll() timeout set from timers_next_expiry, which should be
  taken into account whether it has expired or not.
  When a conn_state timer fires/expires, remove it from the pollset and
  abort the connection (with APLOG_INFO).
  When a conn_state timer is polled, cancel the timer.
---
 server/mpm/event/event.c | 82 ++++++++++++++++++++++++++++----------
 server/mpm_fdqueue.h     |  1 +
 2 files changed, 64 insertions(+), 19 deletions(-)

diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index b58fc50bd94..8ea061140c3 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -267,6 +267,8 @@ struct event_conn_state_t {
     apr_time_t queue_timestamp;
     /** the timeout queue for this entry */
     struct timeout_queue *q;
+    /** the timer event for this entry */
+    timer_event_t *te;
 
     /*
      * when queued to workers
@@ -646,6 +648,7 @@ struct event_srv_cfg_s {
     struct timeout_queue *io_q,
                          *wc_q,
                          *ka_q;
+    server_rec *s; /* backref */
 };
 
 #define ID_FROM_CHILD_THREAD(c, t)  ((c * thread_limit) + t)
@@ -1266,7 +1269,7 @@ static int event_post_read_request(request_rec *r)
 }
 
 static int pollset_add_at(event_conn_state_t *cs, int sense,
-                          struct timeout_queue *q,
+                          struct timeout_queue *q, timer_event_t *te,
                           const char *at, int line)
 {
     apr_status_t rv;
@@ -1275,11 +1278,11 @@ static int pollset_add_at(event_conn_state_t *cs, int sense,
                   "pollset: add %s=%" APR_TIME_T_FMT " events=%x"
                   " for connection %" CS_FMT " at %s:%i",
                   (q) ? "q" : "t",
-                  (q) ? q->timeout : -1,
+                  (q) ? q->timeout : (te) ? te->timeout : -1,
                   (int)cs->pfd.reqevents,
                   CS_ARG(cs), at, line);
 
-    ap_assert(cs->q == NULL && q != NULL);
+    ap_assert(cs->q == NULL && cs->te == NULL && ((q != NULL) ^ (te != NULL)));
 
     set_conn_state_sense(cs, sense);
 
@@ -1287,12 +1290,20 @@ static int pollset_add_at(event_conn_state_t *cs, int sense,
         apr_thread_mutex_lock(timeout_mutex);
         TO_QUEUE_APPEND(q, cs);
     }
+    else {
+        cs->te = te;
+    }
+
     rv = apr_pollset_add(event_pollset, &cs->pfd);
     if (rv != APR_SUCCESS) {
         if (q) {
             TO_QUEUE_REMOVE(q, cs);
             apr_thread_mutex_unlock(timeout_mutex);
         }
+        else {
+            te->canceled = 1;
+            cs->te = NULL;
+        }
 
         /* close_worker_sockets() may have closed it already */
         if (workers_may_exit) {
@@ -1312,8 +1323,8 @@ static int pollset_add_at(event_conn_state_t *cs, int sense,
     }
     return 1;
 }
-#define pollset_add(cs, sense, q) \
-    pollset_add_at((cs), (sense), (q), __FUNCTION__, __LINE__)
+#define pollset_add(cs, sense, q, te) \
+    pollset_add_at((cs), (sense), (q), (te), __FUNCTION__, __LINE__)
 
 static int pollset_del_at(event_conn_state_t *cs, int locked,
                           const char *at, int line)
@@ -1324,11 +1335,11 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
                   "pollset: del %s=%" APR_TIME_T_FMT " events=%x"
                   " for connection %" CS_FMT " at %s:%i",
                   (cs->q) ? "q" : "t",
-                  (cs->q) ? cs->q->timeout : -1,
+                  (cs->q) ? cs->q->timeout : (cs->te ? cs->te->timeout : -1),
                  (int)cs->pfd.reqevents,
                  CS_ARG(cs), at, line);
 
-    ap_assert(cs->q != NULL);
+    ap_assert((cs->q != NULL) ^ (cs->te != NULL));
 
     if (cs->q) {
         if (!locked) {
@@ -1339,6 +1350,10 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
             apr_thread_mutex_unlock(timeout_mutex);
         }
     }
+    else {
+        cs->te->canceled = 1;
+        cs->te = NULL;
+    }
 
     /*
      * Some of the pollset backends, like KQueue or Epoll
@@ -1362,6 +1377,10 @@
     pollset_del_at((cs), (locked), __FUNCTION__, __LINE__)
 
 /* Forward declare */
+static timer_event_t *get_timer_event(apr_time_t timeout,
+                                      ap_mpm_callback_fn_t *cbfn, void *baton,
+                                      int insert,
+                                      apr_array_header_t *pfds);
 static void process_lingering_close(event_conn_state_t *cs);
 
 static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd)
@@ -1547,16 +1566,32 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
     }
 
     if (cs->pub.state == CONN_STATE_ASYNC_WAITIO) {
+        apr_interval_time_t timeout;
+        struct timeout_queue *q = NULL;
+        timer_event_t *te = NULL;
+
         /* Set a read/write timeout for this connection, and let the
         * event thread poll for read/writeability.
         */
        ap_update_child_status(cs->sbh, SERVER_BUSY_READ, NULL);
         notify_suspend(cs);
 
-        /* Modules might set c->cs->sense to CONN_SENSE_WANT_WRITE,
-         * the default is CONN_SENSE_WANT_READ still.
+        /* If the connection timeout is actually different than the waitio_q's,
+         * use a timer event to honor it (e.g. mod_reqtimeout may enforce its
+         * own timeouts per request stage).
          */
-        if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->sc->io_q)) {
+        timeout = ap_get_connection_timeout(c, cs->sc->s);
+        if (timeout >= 0 && timeout != cs->sc->io_q->timeout) {
+            /* Prevent the timer from firing before the pollset is updated */
+            if (timeout < TIMERS_FUDGE_TIMEOUT) {
+                timeout = TIMERS_FUDGE_TIMEOUT;
+            }
+            te = get_timer_event(timeout, NULL, cs, 1, NULL);
+        }
+        else {
+            q = cs->sc->io_q;
+        }
+        if (!pollset_add(cs, CONN_SENSE_WANT_READ, q, te)) {
             apr_table_setn(cs->c->notes, "short-lingering-close", "1");
             cs->pub.state = CONN_STATE_LINGER;
             goto lingering_close;
@@ -1583,7 +1618,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
         /* Let the event thread poll for write */
         notify_suspend(cs);
         cs->pub.sense = CONN_SENSE_DEFAULT;
-        if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q)) {
+        if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) {
             return; /* queued */
         }
         /* Fall through lingering close */
@@ -1620,7 +1655,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
          */
         notify_suspend(cs);
 
-        if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q)) {
+        if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q, NULL)) {
             apr_table_setn(cs->c->notes, "short-lingering-close", "1");
             cs->pub.state = CONN_STATE_LINGER;
             goto lingering_close;
@@ -1661,7 +1696,7 @@ static apr_status_t event_resume_suspended (conn_rec *c)
     cs->pub.sense = CONN_SENSE_DEFAULT;
     if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) {
         cs->pub.state = CONN_STATE_WRITE_COMPLETION;
-        if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q)) {
+        if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) {
             return APR_SUCCESS; /* queued */
         }
 
@@ -1884,8 +1919,7 @@ static int timer_comp(void *a, void *b)
 static apr_thread_mutex_t *g_timer_skiplist_mtx;
 
 static timer_event_t *get_timer_event(apr_time_t timeout,
-                                      ap_mpm_callback_fn_t *cbfn,
-                                      void *baton,
+                                      ap_mpm_callback_fn_t *cbfn, void *baton,
                                       int insert,
                                       apr_array_header_t *pfds)
 {
@@
-1909,6 +1943,7 @@ static timer_event_t *get_timer_event(apr_time_t timeout, te->cbfunc = cbfn; te->baton = baton; te->when = now + timeout; + te->timeout = timeout; te->pfds = pfds; if (insert) { @@ -2141,7 +2176,7 @@ static void process_lingering_close(event_conn_state_t *cs) struct timeout_queue *q; /* (Re)queue the connection to come back when readable */ q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; - if (pollset_add(cs, CONN_SENSE_WANT_READ, q)) { + if (pollset_add(cs, CONN_SENSE_WANT_READ, q, NULL)) { return; /* queued */ } } @@ -2195,7 +2230,6 @@ static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, break; } - TO_QUEUE_REMOVE(qp, cs); if (!pollset_del(cs, 1)) { kill_connection(cs, APR_EGENERAL); continue; @@ -2353,8 +2387,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) apr_thread_mutex_lock(g_timer_skiplist_mtx); while ((te = apr_skiplist_peek(timer_skiplist))) { if (te->when > now) { - timers_next_expiry = te->when; - timeout = te->when - now; break; } apr_skiplist_pop(timer_skiplist, NULL); @@ -2364,6 +2396,17 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) continue; } + if (!te->cbfunc) { + cs = te->baton; + put_timer_event(te, 1); + ap_assert(cs && cs->te == te); + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "timed out connection %" CS_FMT, CS_ARG(cs)); + (void)pollset_del(cs, 0); + kill_connection(cs, APR_TIMEUP); + continue; + } + if (te->pfds) { /* remove all sockets from the pollset */ apr_pool_cleanup_run(te->pfds->pool, te->pfds, @@ -4417,6 +4460,7 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, for (; s; s = s->next) { event_srv_cfg *sc = apr_pcalloc(pconf, sizeof *sc); ap_set_module_config(s->module_config, &mpm_event_module, sc); + sc->s = s; /* backref */ sc->io_q = TO_QUEUE_CHAIN(pconf, "waitio", s->timeout, &waitio_q, io_h, ptemp); diff --git a/server/mpm_fdqueue.h b/server/mpm_fdqueue.h index 0dd558b938a..260e22ab80e 100644 --- a/server/mpm_fdqueue.h +++ b/server/mpm_fdqueue.h @@ -70,6 +70,7 @@ struct timer_event_t void *baton; int canceled; apr_array_header_t *pfds; + apr_interval_time_t timeout; }; typedef struct timer_event_t timer_event_t; From ae9a3b90f96939e38f2f92a73e34180b83e41c8a Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 3 Jun 2024 16:42:51 +0200 Subject: [PATCH 10/22] mpm_fdqueue: Allow to queue any events (socket, timer, opaque), and use that for mpm_event's backlog queue. 
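Queuing a connection then boils down to filling the embedded event and
pushing it (usage sketch assembled from the hunks below; a fragment, not
compilable on its own, with error handling elided):

    /* The event is embedded in event_conn_state_t; its callback runs
     * under the queue lock on push and on pop, which keeps the
     * backlog_q timeout queue consistent with worker_queue itself.
     */
    ap_queue_event_t *qe = cs_qe(cs);      /* &cs->bse.qe */
    qe->type = AP_QUEUE_EVENT_SOCK;
    qe->data.se = cs_se(cs);               /* &cs->bse.se */
    qe->cb = conn_state_backlog_cb;        /* marks cs as "in backlog" */
    qe->cb_baton = cs;
    rv = ap_queue_push_event(worker_queue, qe);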
--- include/scoreboard.h | 1 + modules/lua/lua_request.c | 4 + server/mpm/event/event.c | 891 +++++++++++++++++++------------------ server/mpm/worker/worker.c | 9 +- server/mpm_fdqueue.c | 580 +++++++++++++++--------- server/mpm_fdqueue.h | 92 ++-- 6 files changed, 884 insertions(+), 693 deletions(-) diff --git a/include/scoreboard.h b/include/scoreboard.h index 25d19f03538..e83e52fdb16 100644 --- a/include/scoreboard.h +++ b/include/scoreboard.h @@ -149,6 +149,7 @@ struct process_score { apr_uint32_t keep_alive; /* async connections in keep alive */ apr_uint32_t suspended; /* connections suspended by some module */ apr_uint32_t wait_io; /* async connections waiting an IO in the MPM */ + apr_uint32_t backlog; /* async connections waiting for a worker */ }; /* Scoreboard is now in 'local' memory, since it isn't updated once created, diff --git a/modules/lua/lua_request.c b/modules/lua/lua_request.c index 6787bbfaf7f..5fa3a968c6b 100644 --- a/modules/lua/lua_request.c +++ b/modules/lua/lua_request.c @@ -1248,6 +1248,10 @@ static int lua_ap_scoreboard_process(lua_State *L) lua_pushnumber(L, ps_record->connections); lua_settable(L, -3); + lua_pushstring(L, "backlog"); + lua_pushnumber(L, ps_record->backlog); + lua_settable(L, -3); + lua_pushstring(L, "keepalive"); lua_pushnumber(L, ps_record->keep_alive); lua_settable(L, -3); diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 8ea061140c3..5a9f4b676b4 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -203,6 +203,7 @@ static volatile int workers_may_exit = 0; static volatile int start_thread_may_exit = 0; static volatile int listener_may_exit = 0; static apr_uint32_t connection_count = 0; /* Number of open connections */ +static apr_uint32_t timers_count = 0; /* Number of queued timers */ static apr_uint32_t lingering_count = 0; /* Number of connections in lingering close */ static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown @@ -236,6 +237,14 @@ static apr_thread_mutex_t *timeout_mutex; * XXX: cases. */ static apr_pollset_t *event_pollset; +#define POLLSET_RESERVE_SIZE 10000 + +struct backlog_timer_event { + timer_event_t te; + ap_queue_event_t qe; +}; +#define te_qe(te) (&((struct backlog_timer_event *)(te))->qe) +#define te_in_backlog(te) (te_qe(te)->cb != NULL) typedef struct event_conn_state_t event_conn_state_t; struct event_conn_state_t { @@ -273,8 +282,12 @@ struct event_conn_state_t { /* * when queued to workers */ - /** chaining in defer_linger_chain */ - struct event_conn_state_t *chain; + /** the backlog event for this entry */ + struct backlog_socket_event { + sock_event_t se; + ap_queue_event_t qe; + struct timeout_queue *q; + } bse; /* * bools as bits @@ -290,6 +303,9 @@ struct event_conn_state_t { /** Has ap_start_lingering_close() been called? */ linger_started :1; }; +#define cs_se(cs) (&(cs)->bse.se) +#define cs_qe(cs) (&(cs)->bse.qe) +#define cs_in_backlog(cs) (cs_qe(cs)->cb != NULL) static APR_INLINE apr_socket_t *cs_sd(event_conn_state_t *cs) { @@ -336,12 +352,6 @@ static APR_INLINE const char *cs_state_str(event_conn_state_t *cs) #define CS_FMT_TO CS_FMT " to [%pI]" #define CS_ARG_TO(cs) CS_ARG(cs), cs_raddr(cs) -/* - * The chain of connections to be shutdown by a worker thread (deferred), - * linked list updated atomically. 
- */ -static event_conn_state_t *volatile defer_linger_chain; - #define USE_CLOCK_COARSE 0 /* not for now */ #if HAVE_CLOCK_GETTIME && defined(CLOCK_MONOTONIC) /* POSIX */ static clockid_t event_clockid; @@ -447,14 +457,15 @@ struct timeout_queue { * keepalive_q uses vhost's KeepAliveTimeOut * linger_q uses MAX_SECS_TO_LINGER * short_linger_q uses SECONDS_TO_LINGER + * backlog_q uses vhost's TimeOut */ static struct timeout_queue *waitio_q, /* wait for I/O to happen */ *write_completion_q, /* completion or user async poll */ *keepalive_q, /* in between requests */ *linger_q, /* lingering (read) before close */ - *short_linger_q; /* lingering (read) before close (short timeout) */ - -static volatile apr_time_t queues_next_expiry; /* next expiry time accross all queues */ + *short_linger_q, /* lingering (read) before close (short timeout) */ + *backlog_q; /* waiting for a worker */ +static volatile apr_time_t queues_next_expiry; /* next expiry time accross all queues */ /* * Macros for accessing struct timeout_queue. @@ -584,7 +595,6 @@ typedef struct socket_callback_baton apr_array_header_t *pfds; timer_event_t *cancel_event; /* If a timeout was requested, a pointer to the timer event */ struct socket_callback_baton *next; - unsigned int signaled :1; } socket_callback_baton_t; typedef struct event_child_bucket { @@ -647,7 +657,8 @@ struct event_srv_cfg_s { /* Per server timeout queues */ struct timeout_queue *io_q, *wc_q, - *ka_q; + *ka_q, + *bl_q; server_rec *s; /* backref */ }; @@ -696,25 +707,34 @@ static int ap_child_slot; /* Current child process slot in scoreboard */ */ static apr_socket_t **worker_sockets; -static volatile apr_uint32_t listensocks_disabled; +/* Disabling / enabling listening sockets can only happen in the listener + * thread, which is the only one to set 'dying' to 1 too, so it's all thread + * safe. 'listensocks_off' is changed atomically still because it's read + * concurrently in listensocks_disabled(). 
+ */ +static /*atomic*/ apr_uint32_t listensocks_off = 0; -static void disable_listensocks(void) +static int disable_listensocks(void) { int i; - if (apr_atomic_cas32(&listensocks_disabled, 1, 0) != 0) { - return; + + if (apr_atomic_cas32(&listensocks_off, 1, 0) != 0) { + return 0; } ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) - "Suspend listening sockets: idlers:%i conns:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u suspended:%u", - ap_queue_info_num_idlers(worker_queue_info), + "Suspend listening sockets: idlers:%i conns:%u backlog:%u " + "waitio:%u write:%u keepalive:%u linger:%u/%u " + "timers:%u suspended:%u", + ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), + apr_atomic_read32(backlog_q->total), apr_atomic_read32(waitio_q->total), apr_atomic_read32(write_completion_q->total), apr_atomic_read32(keepalive_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), + apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); ap_scoreboard_image->parent[ap_child_slot].not_accepting = 1; @@ -722,26 +742,31 @@ static void disable_listensocks(void) for (i = 0; i < num_listensocks; i++) { apr_pollset_remove(event_pollset, &listener_pollfd[i]); } + return 1; } -static void enable_listensocks(void) +static int enable_listensocks(void) { int i; + if (listener_may_exit - || apr_atomic_cas32(&listensocks_disabled, 0, 1) != 1) { - return; + || apr_atomic_cas32(&listensocks_off, 0, 1) != 1) { + return 0; } ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) - "Resume listening sockets: idlers:%i conns:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u suspended:%u", - ap_queue_info_num_idlers(worker_queue_info), + "Resume listening sockets: idlers:%i conns:%u backlog:%u " + "waitio:%u write:%u keepalive:%u linger:%u/%u " + "timers:%u suspended:%u", + ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), + apr_atomic_read32(backlog_q->total), apr_atomic_read32(waitio_q->total), apr_atomic_read32(write_completion_q->total), apr_atomic_read32(keepalive_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), + apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); /* @@ -753,23 +778,24 @@ static void enable_listensocks(void) for (i = 0; i < num_listensocks; i++) { apr_pollset_add(event_pollset, &listener_pollfd[i]); } + return 1; } -static APR_INLINE apr_uint32_t listeners_disabled(void) +static APR_INLINE int listensocks_disabled(void) { - return apr_atomic_read32(&listensocks_disabled); + return apr_atomic_read32(&listensocks_off) != 0; } static APR_INLINE int connections_above_limit(int *busy) { - apr_uint32_t i_count = ap_queue_info_num_idlers(worker_queue_info); + apr_int32_t i_count = ap_queue_info_idlers_count(worker_queue_info); if (i_count > 0) { apr_uint32_t c_count = apr_atomic_read32(&connection_count); apr_uint32_t l_count = apr_atomic_read32(&lingering_count); if (c_count <= l_count - /* Off by 'listeners_disabled()' to avoid flip flop */ + /* Off by 'listensocks_disabled()' to avoid flip flop */ || c_count - l_count < (apr_uint32_t)threads_per_child + - (i_count - listeners_disabled()) * + (i_count - listensocks_disabled()) * (worker_factor / WORKER_FACTOR_SCALE)) { return 0; } @@ -782,7 +808,7 @@ static APR_INLINE int connections_above_limit(int *busy) static APR_INLINE int should_enable_listensocks(void) { - return !dying && listeners_disabled() && 
!connections_above_limit(NULL); + return !dying && listensocks_disabled() && !connections_above_limit(NULL); } static void close_socket_at(apr_socket_t *csd, @@ -1101,36 +1127,6 @@ static void notify_resume(event_conn_state_t *cs, int cleanup) ap_run_resume_connection(cs->c, cs->r); } -/* - * Defer flush and close of the connection by adding it to defer_linger_chain, - * for a worker to grab it and do the job (should that be blocking). - * Pre-condition: nonblocking, can be called from anywhere provided cs is not - * in any timeout queue or in the pollset. - */ -static int defer_lingering_close(event_conn_state_t *cs) -{ - ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "deferring close for connection %" CS_FMT, CS_ARG(cs)); - - /* The connection is not shutdown() yet strictly speaking, but it's not - * in any queue nor handled by a worker either (will be very soon), so - * to account for it somewhere we bump lingering_count now (and set - * deferred_linger for process_lingering_close() to know). - */ - cs->pub.state = CONN_STATE_LINGER; - apr_atomic_inc32(&lingering_count); - cs->deferred_linger = 1; - for (;;) { - event_conn_state_t *chain = cs->chain = defer_linger_chain; - if (apr_atomic_casptr((void *)&defer_linger_chain, cs, - chain) != chain) { - /* Race lost, try again */ - continue; - } - return 1; - } -} - /* Close the connection and release its resources (ptrans), either because an * unrecoverable error occured (queues or pollset add/remove) or more usually * if lingering close timed out. @@ -1178,23 +1174,53 @@ static void kill_connection_at(event_conn_state_t *cs, apr_status_t status, /* forward declare */ static void set_conn_state_sense(event_conn_state_t *cs, int sense); +static void push2worker(event_conn_state_t *cs, timer_event_t *te, + apr_time_t now, int *busy); /* Shutdown the connection in case of timeout, error or resources shortage. * This starts short lingering close if not already there, or directly closes * the connection otherwise. * Pre-condition: nonblocking, can be called from anywhere provided cs is not - * in any timeout queue or in the pollset. + * in the pollset nor any non-backlog timeout queue. */ -static int shutdown_connection(event_conn_state_t *cs) +static void shutdown_connection(event_conn_state_t *cs, apr_time_t now, + int in_backlog) { - if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); - defer_lingering_close(cs); + ap_assert(!cs->q && !cs->te); + + if (cs->c) { + int log_level = APLOG_INFO; + switch (cs->pub.state) { + case CONN_STATE_LINGER: + case CONN_STATE_LINGER_NORMAL: + case CONN_STATE_LINGER_SHORT: + case CONN_STATE_KEEPALIVE: + log_level = APLOG_TRACE2; + default: + break; + } + ap_log_cerror(APLOG_MARK, log_level, 0, cs->c, APLOGNO(10380) + "shutting down %s connection in %s", + in_backlog ? "backlog" : "timed out", + cs_state_str(cs)); + + /* Don't re-schedule connections in lingering close, they had + * their chance already so just close them now. + */ + if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { + apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + cs->pub.state = CONN_STATE_LINGER; + push2worker(cs, NULL, now, NULL); + } + else { + close_connection(cs); + } } else { - close_connection(cs); + /* Never been scheduled/processed, kill it. 
*/ + ap_assert(in_backlog); + kill_connection(cs, APR_EBUSY); } - return 1; } /* @@ -1388,10 +1414,13 @@ static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd) event_conn_state_t *cs = apr_pcalloc(p, sizeof(*cs)); listener_poll_type *pt; - cs->p = p; - cs->pfd.desc.s = csd; cs->pfd.desc_type = APR_POLL_SOCKET; + cs->pfd.desc.s = cs_se(cs)->sd = csd; cs->pfd.client_data = pt = apr_pcalloc(p, sizeof(*pt)); + cs_qe(cs)->cb_baton = cs_se(cs)->baton = cs; + cs_qe(cs)->type = AP_QUEUE_EVENT_SOCK; + cs_qe(cs)->data.se = cs_se(cs); + cs->p = cs_se(cs)->p = p; pt->type = PT_CSD; pt->baton = cs; @@ -1814,85 +1843,128 @@ static void init_serf(apr_pool_t *p) } #endif -static apr_status_t push_timer2worker(timer_event_t* te) +/* A backlog connection is both in the worker_queue (for a worker to pull + * it ASAP) and in the backlog_q (for the listener to enforce a timeout). + * The worker_queue can do the queuing on both queues for us, that is + * consistently and safely push/pop to/from both queues under its lock, + * thanks to a callback called when an event is pushed and popped. + */ +static void conn_state_backlog_cb(void *baton, int pushed) { - return ap_queue_push_timer(worker_queue, te); + event_conn_state_t *cs = baton; + + if (pushed) { + TO_QUEUE_APPEND(cs->sc->bl_q, cs); + } + else { /* popped */ + TO_QUEUE_REMOVE(cs->sc->bl_q, cs); + + /* not in backlog anymore */ + cs_qe(cs)->cb = NULL; + } } -/* - * Pre-condition: cs is neither in event_pollset nor a timeout queue - * this function may only be called by the listener - */ -static apr_status_t push2worker(event_conn_state_t *cs, apr_socket_t *csd, - apr_pool_t *ptrans) +static void timer_event_backlog_cb(void *baton, int pushed) { - apr_status_t rc; + timer_event_t *te = baton; + ap_assert(te && te_qe(te)); - if (cs) { - ptrans = cs->p; - csd = cs_sd(cs); + if (pushed) { + apr_atomic_inc32(&timers_count); } + else { /* popped */ + apr_atomic_dec32(&timers_count); - rc = ap_queue_push_socket(worker_queue, csd, cs, ptrans); - if (rc != APR_SUCCESS) { - ap_log_error(APLOG_MARK, APLOG_CRIT, rc, ap_server_conf, APLOGNO(00471) - "push2worker: ap_queue_push_socket failed"); - /* trash the connection; we couldn't queue the connected - * socket to a worker - */ - if (cs) { - kill_connection(cs, rc); - } - else { - if (csd) { - close_socket(csd); - } - if (ptrans) { - ap_queue_info_push_pool(worker_queue_info, ptrans); - } - } - signal_threads(ST_GRACEFUL); + /* not in backlog anymore */ + te_qe(te)->cb = NULL; } - - return rc; } -/* get_worker: - * If *have_idle_worker_p == 0, reserve a worker thread, and set - * *have_idle_worker_p = 1. - * If *have_idle_worker_p is already 1, will do nothing. - * If blocking == 1, block if all workers are currently busy. - * If no worker was available immediately, will set *all_busy to 1. - * XXX: If there are no workers, we should not block immediately but - * XXX: close all keep-alive connections first. 
+/* + * Pre-condition: cs is neither in event_pollset nor a queue + * this function may only be called by the listener */ -static void get_worker(int *have_idle_worker_p, int blocking, int *all_busy) +static void push2worker(event_conn_state_t *cs, timer_event_t *te, + apr_time_t now, int *above_limit) { + ap_queue_event_t *qe; apr_status_t rc; + int busy; + + ap_assert((cs != NULL) ^ (te != NULL)); - if (*have_idle_worker_p) { - /* already reserved a worker thread - must have hit a - * transient error on a previous pass + busy = (ap_queue_info_idlers_dec(worker_queue_info) < 0); + if (busy) { + /* Might need to kindle the fire by not accepting new connections until + * the situation settles down. The listener and new idling workers will + * test for should_enable_listensocks() to recover (when suitable). */ - return; + if (connections_above_limit(NULL)) { + disable_listensocks(); + if (above_limit) { + *above_limit = 1; + } + } } - if (blocking) - rc = ap_queue_info_wait_for_idler(worker_queue_info, all_busy); - else - rc = ap_queue_info_try_get_idler(worker_queue_info); + if (te) { + ap_assert(!te_in_backlog(te)); - if (rc == APR_SUCCESS || APR_STATUS_IS_EOF(rc)) { - *have_idle_worker_p = 1; - } - else if (!blocking && rc == APR_EAGAIN) { - *all_busy = 1; + qe = te_qe(te); + qe->cb = timer_event_backlog_cb; } else { - ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf, APLOGNO(00472) - "ap_queue_info_wait_for_idler failed. " - "Attempting to shutdown process gracefully"); - signal_threads(ST_GRACEFUL); + ap_assert(!cs_in_backlog(cs)); + ap_assert(!cs->q); + + if (busy && cs->pub.state == CONN_STATE_LINGER && cs->linger_started) { + /* Not worth lingering more on this connection if we are short of + * workers and everything is flushed+shutdown already, back out + * and close. + */ + ap_queue_info_idlers_inc(worker_queue_info); + close_connection(cs); + return; + } + + if (cs->c) { + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "pushing connection %" CS_FMT, + CS_ARG(cs)); + } + else { + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, + "pushing connection %" CS_FMT_TO, + CS_ARG_TO(cs)); + } + + qe = cs_qe(cs); + qe->cb = conn_state_backlog_cb; + } + + rc = ap_queue_push_event(worker_queue, qe); + if (rc != APR_SUCCESS) { + int mode = ST_GRACEFUL; + + ap_queue_info_idlers_inc(worker_queue_info); + + ap_log_error(APLOG_MARK, APLOG_CRIT, rc, ap_server_conf, APLOGNO(00471) + "push2worker: queuing %s failed", cs ? "socket" : "timer"); + + if (cs) { + /* Can't go anywhere, kill (and log). */ + kill_connection(cs, rc); + } + else { + /* Can't call te->cbfunc() and potentially block there, someone is + * going to miss this event thus never release their connection(s), + * graceful stop could never complete. 
+ */ + mode = ST_UNGRACEFUL; + } + + AP_DEBUG_ASSERT(0); + signal_threads(mode); } } @@ -1935,8 +2007,13 @@ static timer_event_t *get_timer_event(apr_time_t timeout, APR_RING_REMOVE(te, link); } else { - te = apr_skiplist_alloc(timer_skiplist, sizeof(timer_event_t)); - memset(te, 0, sizeof(*te)); + struct backlog_timer_event *bte; + /* invariant: (te == &bte->te) => (te_qe(te) == &bte->qe) */ + bte = apr_skiplist_alloc(timer_skiplist, sizeof(*bte)); + memset(bte, 0, sizeof(*bte)); + bte->qe.type = AP_QUEUE_EVENT_TIMER; + bte->qe.data.te = bte->qe.cb_baton = &bte->te; + te = &bte->te; } APR_RING_ELEM_INIT(te, link); @@ -2123,14 +2200,11 @@ static void process_lingering_close(event_conn_state_t *cs) CS_ARG(cs)); AP_DEBUG_ASSERT(CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)); + /* Flush and shutdown first */ if (!cs->linger_started) { + cs->linger_started = 1; /* once! */ + apr_atomic_inc32(&lingering_count); cs->pub.state = CONN_STATE_LINGER; - cs->linger_started = 1; - - /* defer_lingering_close() may have bumped lingering_count already */ - if (!cs->deferred_linger) { - apr_atomic_inc32(&lingering_count); - } apr_socket_timeout_set(csd, apr_time_from_sec(SECONDS_TO_LINGER)); if (ap_start_lingering_close(cs->c)) { @@ -2157,24 +2231,17 @@ static void process_lingering_close(event_conn_state_t *cs) cs->pub.state = CONN_STATE_LINGER_NORMAL; } cs->pub.sense = CONN_SENSE_DEFAULT; - - /* One timestamp/duration for the whole lingering close time. - * XXX: This makes the (short_)linger_q not sorted/ordered by expiring - * timeouts whenever multiple schedules are necessary (EAGAIN below), - * but we probabaly don't care since these connections do not count - * for connections_above_limit() and all of them will be killed when - * busy or gracefully stopping anyway. - */ - cs->queue_timestamp = apr_time_now(); } + /* Drain until EAGAIN or EOF/error, in the former case requeue and + * come back when readable again, otherwise the connection is over. + */ do { apr_size_t nbytes = sizeof(dummybuf); rv = apr_socket_recv(csd, dummybuf, &nbytes); } while (rv == APR_SUCCESS); if (APR_STATUS_IS_EAGAIN(rv)) { struct timeout_queue *q; - /* (Re)queue the connection to come back when readable */ q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; if (pollset_add(cs, CONN_SENSE_WANT_READ, q, NULL)) { return; /* queued */ @@ -2183,108 +2250,92 @@ static void process_lingering_close(event_conn_state_t *cs) close_connection(cs); } -/* call 'func' for all elements of 'q' above 'expiry'. +/* Call shutdown_connection() for the elements of 'q' that timed out, or + * for all if 'shrink' is set. 
* Pre-condition: timeout_mutex must already be locked - * Post-condition: timeout_mutex will be locked again */ -static void process_timeout_queue(struct timeout_queue *q, apr_time_t expiry, - int (*func)(event_conn_state_t *)) +static unsigned int process_timeout_queue_ex(struct timeout_queue *queue, + apr_time_t now, + int shrink) { - apr_uint32_t total = 0, count; - event_conn_state_t *first, *cs, *last; - struct event_conn_state_t trash; - struct timeout_queue *qp; + unsigned int count = 0; + struct timeout_queue *q; - if (!*q->total) { - return; + if (!*queue->total) { + return 0; } - APR_RING_INIT(&trash.timeout_list, event_conn_state_t, timeout_list); - for (qp = q; qp; qp = qp->next) { - count = 0; - cs = first = last = APR_RING_FIRST(&qp->head); - while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t, - timeout_list)) { - /* Trash the entry if: - * - no expiry was given (zero means all), or - * - it expired (according to the queue timeout), or - * - the system clock skewed in the past: no entry should be - * registered above the given expiry (~now) + the queue - * timeout, we won't keep any here (eg. for centuries). - * - * Otherwise stop, no following entry will match thanks to the - * single timeout per queue (entries are added to the end!). - * This allows maintenance in O(1). - */ - if (expiry && cs->queue_timestamp + qp->timeout > expiry - && cs->queue_timestamp < expiry + qp->timeout) { - /* Since this is the next expiring entry of this queue, update - * the global queues_next_expiry if it's later than this one. + for (q = queue; q; q = q->next) { + while (!APR_RING_EMPTY(&q->head, event_conn_state_t, timeout_list)) { + event_conn_state_t *cs = APR_RING_FIRST(&q->head); + + ap_assert(cs->q == q); + + if (!shrink) { + /* Stop if this entry did not expire, no following one will + * thanks to the single timeout per queue (latest entries are + * added to the tail). */ - apr_time_t elem_expiry = cs->queue_timestamp + qp->timeout; - apr_time_t next_expiry = queues_next_expiry; - if (!next_expiry + apr_time_t elem_expiry = cs->queue_timestamp + q->timeout; + if (elem_expiry > now) { + /* This is the next expiring entry of this queue, update + * the global queues_next_expiry if it expires after + * this one. + */ + apr_time_t next_expiry = queues_next_expiry; + if (!next_expiry || next_expiry > elem_expiry + QUEUES_FUDGE_TIMEOUT) { - queues_next_expiry = elem_expiry; + queues_next_expiry = elem_expiry; + } + break; } - break; } - if (!pollset_del(cs, 1)) { - kill_connection(cs, APR_EGENERAL); - continue; + if (cs_in_backlog(cs)) { + /* Remove the backlog connection from worker_queue (note that + * the lock is held by the listener already when maintaining + * the backlog_q), and unreserve/set a worker/idler since + * none could handle the event. + */ + ap_assert(cs_qe(cs)->cb_baton == cs); + ap_assert(cs->q == cs->sc->bl_q); + ap_queue_info_idlers_inc(worker_queue_info); + ap_queue_kill_event_locked(worker_queue, cs_qe(cs)); + shutdown_connection(cs, now, 1); } - - if (cs == first) { - APR_RING_INSERT_HEAD(&qp->head, cs, event_conn_state_t, - timeout_list); + else if (pollset_del(cs, 1)) { + /* Removed from the pollset and timeout queue. */ + shutdown_connection(cs, now, 0); } else { - APR_RING_INSERT_AFTER(last, cs, timeout_list); + /* Can't go anywhere, kill (and log). 
*/ + kill_connection(cs, APR_EGENERAL); } - ++*qp->total; - ++qp->count; - last = cs; - cs = APR_RING_NEXT(cs, timeout_list); count++; } - if (!count) - continue; - - APR_RING_UNSPLICE(first, last, timeout_list); - APR_RING_SPLICE_TAIL(&trash.timeout_list, first, last, event_conn_state_t, - timeout_list); - AP_DEBUG_ASSERT(*q->total >= count && qp->count >= count); - *q->total -= count; - qp->count -= count; - total += count; } - if (!total) - return; - apr_thread_mutex_unlock(timeout_mutex); - first = APR_RING_FIRST(&trash.timeout_list); - do { - cs = APR_RING_NEXT(first, timeout_list); - APR_RING_ELEM_INIT(cs, timeout_list); - func(first); - first = cs; - } while (--total); - apr_thread_mutex_lock(timeout_mutex); + return count; } -static void process_keepalive_queue(apr_time_t expiry) +static APR_INLINE void process_timeout_queue(struct timeout_queue *queue, + apr_time_t now) { - /* If all workers are busy, we kill older keep-alive connections so - * that they may connect to another process. - */ - if (!expiry && *keepalive_q->total) { + (void)process_timeout_queue_ex(queue, now, 0); +} + +/* When all workers are busy or dying, kill'em all \m/ */ +static APR_INLINE void shrink_timeout_queue(struct timeout_queue *queue, + apr_time_t now) +{ + unsigned int count = process_timeout_queue_ex(queue, now, 1); + if (count) { ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, - "All workers are busy or dying, will shutdown %u " - "keep-alive connections", *keepalive_q->total); + "All workers are %s, %s queue shrinked (%u done, %u left)", + dying ? "dying" : "busy", queue->name, + count, apr_atomic_read32(queue->total)); } - process_timeout_queue(keepalive_q, expiry, shutdown_connection); } static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) @@ -2293,7 +2344,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) proc_info *ti = dummy; int process_slot = ti->pslot; process_score *ps = ap_get_scoreboard_process(process_slot); - int have_idle_worker = 0; apr_time_t last_log; last_log = event_time_now(); @@ -2316,7 +2366,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) int workers_were_busy = 0; socket_callback_baton_t *user_chain; const apr_pollfd_t *out_pfd; - apr_time_t now; + apr_time_t now, poll_time; event_conn_state_t *cs; timer_event_t *te; @@ -2325,6 +2375,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) check_infinite_requests(); } + now = poll_time = event_time_now(); + if (listener_may_exit) { int once = !dying; if (once) { @@ -2332,7 +2384,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } if (terminate_mode == ST_UNGRACEFUL - || apr_atomic_read32(&connection_count) == 0) + || (apr_atomic_read32(&connection_count) == 0 + && apr_atomic_read32(&timers_count) == 0)) break; if (once) { @@ -2345,7 +2398,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } if (APLOGtrace6(ap_server_conf)) { - now = event_time_now(); /* trace log status every second */ if (now - last_log > apr_time_from_sec(1)) { ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, @@ -2376,7 +2428,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * up occurs, otherwise periodic checks (maintenance, shutdown, ...) * must be performed. 
*/ - now = event_time_now(); timeout = -1; /* Push expired timers to a worker, the first remaining one (if any) @@ -2401,7 +2452,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) put_timer_event(te, 1); ap_assert(cs && cs->te == te); ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "timed out connection %" CS_FMT, CS_ARG(cs)); + "timed out connection %" CS_FMT, + CS_ARG(cs)); (void)pollset_del(cs, 0); kill_connection(cs, APR_TIMEUP); continue; @@ -2412,7 +2464,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) apr_pool_cleanup_run(te->pfds->pool, te->pfds, event_cleanup_poll_callback); } - push_timer2worker(te); + push2worker(NULL, te, now, &workers_were_busy); } if (te) { next_expiry = te->when; @@ -2453,13 +2505,14 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, - "pollset: wait for timeout=%" APR_TIME_T_FMT + "pollset: wait timeout=%" APR_TIME_T_FMT " queues_timeout=%" APR_TIME_T_FMT " timers_timeout=%" APR_TIME_T_FMT - " conns=%d exit=%d/%d", + " listen=%s conns=%d exit=%d/%d", timeout, - queues_next_expiry ? queues_next_expiry - now : -1, - timers_next_expiry ? timers_next_expiry - now : -1, + queues_next_expiry ? queues_next_expiry - now : 0, + timers_next_expiry ? timers_next_expiry - now : 0, + listensocks_disabled() ? "no" : "yes", apr_atomic_read32(&connection_count), listener_may_exit, dying); @@ -2476,34 +2529,36 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) num = 0; } - if (APLOGtrace7(ap_server_conf)) { - apr_time_t old_now = now; - now = event_time_now(); - - ap_log_error(APLOG_MARK, APLOG_TRACE7, rc, ap_server_conf, - "pollset: have #%i time=%" APR_TIME_T_FMT "/%" APR_TIME_T_FMT - " queues_timeout=%" APR_TIME_T_FMT - " timers_timeout=%" APR_TIME_T_FMT - " conns=%d exit=%d/%d", - (int)num, now - old_now, timeout, - queues_next_expiry ? queues_next_expiry - now : -1, - timers_next_expiry ? timers_next_expiry - now : -1, - apr_atomic_read32(&connection_count), - listener_may_exit, dying); - } - - /* XXX possible optimization: stash the current time for use as - * r->request_time for new requests or queues maintenance + /* Update "now" after polling and use it for everything below (all + * non-(indefinitely-)blocking code). "now - poll_time" is then the + * time passed in poll(). + * + * XXX possible optimization: stash this time for use as + * r->request_time for new requests. */ + now = event_time_now(); + + ap_log_error(APLOG_MARK, APLOG_TRACE7, rc, ap_server_conf, + "pollset: have num=%i" + " elapsed=%" APR_TIME_T_FMT "/%" APR_TIME_T_FMT + " queues_timeout=%" APR_TIME_T_FMT + " timers_timeout=%" APR_TIME_T_FMT + " listen=%s conns=%d exit=%d/%d", + (int)num, now - poll_time, timeout, + queues_next_expiry ? queues_next_expiry - now : 0, + timers_next_expiry ? timers_next_expiry - now : 0, + listensocks_disabled() ? 
"no" : "yes", + apr_atomic_read32(&connection_count), + listener_may_exit, dying); for (user_chain = NULL; num > 0; --num, ++out_pfd) { listener_poll_type *pt = out_pfd->client_data; + socket_callback_baton_t *baton; - if (pt->type == PT_CSD) { - /* one of the sockets is readable */ - int blocking = 1; - - cs = (event_conn_state_t *) pt->baton; + switch (pt->type) { + case PT_CSD: + /* one of the sockets is ready */ + cs = (event_conn_state_t *)pt->baton; ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, "polled connection %" CS_FMT, CS_ARG(cs)); @@ -2513,12 +2568,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) case CONN_STATE_ASYNC_WAITIO: cs->pub.state = CONN_STATE_PROCESSING; case CONN_STATE_WRITE_COMPLETION: - break; - case CONN_STATE_LINGER_NORMAL: case CONN_STATE_LINGER_SHORT: - /* don't wait for a worker for lingering close processing. */ - blocking = 0; break; default: @@ -2529,53 +2580,29 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ap_assert(0); } - if (!pollset_del(cs, 0)) { + if (pollset_del(cs, 0)) { + push2worker(cs, NULL, now, &workers_were_busy); + } + else { /* Can't go anywhere, kill (and log) and next. */ kill_connection(cs, APR_EGENERAL); - continue; } + break; - { - /* If we don't get a worker immediately (nonblocking), we - * close the connection; the client can re-connect to a - * different process for keepalive, and for lingering close - * the connection will be shutdown so the choice is to favor - * incoming/alive connections. - */ - get_worker(&have_idle_worker, blocking, - &workers_were_busy); - if (!have_idle_worker) { - shutdown_connection(cs); - } - else if (push2worker(cs, NULL, NULL) == APR_SUCCESS) { - have_idle_worker = 0; - } - } - } - else if (pt->type == PT_ACCEPT && !listeners_disabled()) { + case PT_ACCEPT: /* A Listener Socket is ready for an accept() */ if (workers_were_busy) { - disable_listensocks(); - ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - APLOGNO(03268) - "All workers busy, not accepting new conns " - "in this process"); - } - else if (connections_above_limit(&workers_were_busy)) { - disable_listensocks(); - ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - APLOGNO(03269) - "Too many open connections (%u, idlers %u), " - "not accepting new conns in this process", - apr_atomic_read32(&connection_count), - ap_queue_info_num_idlers(worker_queue_info)); + /* Listeners disabled for now, keep the new connection in + * the socket backlog until listening again. 
+ */ + continue; } - else if (!listener_may_exit) { + if (!dying) { void *csd = NULL; ap_listen_rec *lr = (ap_listen_rec *) pt->baton; apr_pool_t *ptrans; /* Pool for per-transaction stuff */ - ap_queue_info_pop_pool(worker_queue_info, &ptrans); + ptrans = ap_queue_info_pop_pool(worker_queue_info); if (ptrans == NULL) { /* create a new transaction pool for each accepted socket */ apr_allocator_t *allocator = NULL; @@ -2604,25 +2631,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } } - get_worker(&have_idle_worker, 1, &workers_were_busy); rc = lr->accept_func(&csd, lr, ptrans); - - /* later we trash rv and rely on csd to indicate - * success/failure - */ - AP_DEBUG_ASSERT(rc == APR_SUCCESS || !csd); - - if (rc == APR_EGENERAL) { - /* E[NM]FILE, ENOMEM, etc */ - resource_shortage = 1; - signal_threads(ST_GRACEFUL); - } - else if (ap_accept_error_is_nonfatal(rc)) { - ap_log_error(APLOG_MARK, APLOG_DEBUG, rc, ap_server_conf, - "accept() on client socket failed"); - } - - if (csd != NULL) { + if (rc == APR_SUCCESS) { conns_this_child--; /* Create and account for the connection from here, or @@ -2630,40 +2640,45 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * would consider it does not exist and could exit the * child too early. */ + ap_assert(csd != NULL); cs = make_conn_state(ptrans, csd); - if (push2worker(cs, NULL, NULL) == APR_SUCCESS) { - have_idle_worker = 0; - } + push2worker(cs, NULL, now, &workers_were_busy); } else { + if (rc == APR_EGENERAL) { + /* E[NM]FILE, ENOMEM, etc */ + resource_shortage = 1; + signal_threads(ST_GRACEFUL); + } + else if (ap_accept_error_is_nonfatal(rc)) { + ap_log_error(APLOG_MARK, APLOG_DEBUG, rc, ap_server_conf, + "accept() on client socket failed"); + } ap_queue_info_push_pool(worker_queue_info, ptrans); } } - } /* if:else on pt->type */ + break; + #if HAVE_SERF - else if (pt->type == PT_SERF) { + case PT_SERF: /* send socket to serf. */ - /* XXXX: this doesn't require get_worker() */ + /* XXXX: this doesn't require a worker thread */ serf_event_trigger(g_serf, pt->baton, out_pfd); - } - + break; #endif - else if (pt->type == PT_USER) { - socket_callback_baton_t *baton = pt->baton; - if (baton->cancel_event) { - baton->cancel_event->canceled = 1; - } - /* We only signal once per N sockets with this baton, - * and after this loop to avoid any race/lifetime issue - * with the user callback being called while we handle - * the same baton multiple times here. + case PT_USER: + /* Multiple pfds of the same baton might trigger in this pass + * so chain once here and run the cleanup only after this loop + * to avoid lifetime issues (i.e. pfds->pool cleared while some + * of its pfd->client_data are still to be dereferenced here). 
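The "chain now, clean up after the loop" pattern described above can be shown in a minimal standalone form. This sketch uses an explicit flag for clarity, whereas the patch itself avoids the flag by testing the baton's next pointer against the chain head:

```c
/* Hypothetical minimal types, for illustration only. */
typedef struct work {
    struct work *next;
    int chained;              /* already linked into the deferred chain? */
} work;

/* Phase 1: while scanning results, only link items; destroying their
 * backing state here could invalidate data that later results in the
 * same scan still reference. */
static void chain_once(work **head, work *w)
{
    if (!w->chained) {
        w->chained = 1;
        w->next = *head;
        *head = w;
    }
}

/* Phase 2: after the scan, it is safe to run the destructive part. */
static void run_deferred(work *head, void (*cleanup)(work *))
{
    while (head) {
        work *w = head;
        head = w->next;
        w->next = NULL;
        w->chained = 0;
        cleanup(w);
    }
}
```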
*/ - if (!baton->signaled) { - baton->signaled = 1; + baton = pt->baton; + if (baton != user_chain && !baton->next) { baton->next = user_chain; user_chain = baton; } + break; } } /* for processing poll */ @@ -2673,6 +2688,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) user_chain = user_chain->next; baton->next = NULL; + /* Not expirable anymore */ + if (baton->cancel_event) { + baton->cancel_event->canceled = 1; + baton->cancel_event = NULL; + } + /* remove all sockets from the pollset */ apr_pool_cleanup_run(baton->pfds->pool, baton->pfds, event_cleanup_poll_callback); @@ -2683,7 +2704,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) baton->user_baton, 0, /* don't insert it */ NULL /* no associated socket callback */); - push_timer2worker(te); + push2worker(NULL, te, now, &workers_were_busy); } /* We process the timeout queues here only when the global @@ -2692,10 +2713,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) * while latest ones are only taken into account here (in listener) * during queues' processing, with the lock held. This works both * with and without wake-ability. + * Even if "now" drifted a bit since it was fetched and the real + * "now" went below "expiry" in the meantime, the next poll() will + * return immediately so the maintenance will happen then. */ next_expiry = queues_next_expiry; + if (next_expiry && next_expiry <= now) { do_maintenance: - if (next_expiry && next_expiry <= (now = event_time_now())) { ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "queues maintenance: expired=%" APR_TIME_T_FMT, next_expiry > 0 ? now - next_expiry : -1); @@ -2705,29 +2729,39 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* Recompute this by walking the timeout queues (under the lock) */ queues_next_expiry = 0; - /* Step 1: keepalive queue timeouts are closed */ + /* Step 1: keepalive queue timeouts */ if (workers_were_busy || dying) { - process_keepalive_queue(0); /* kill'em all \m/ */ + shrink_timeout_queue(keepalive_q, now); } else { - process_keepalive_queue(now); + process_timeout_queue(keepalive_q, now); } - /* Step 2: waitio queue timeouts are flushed */ - process_timeout_queue(waitio_q, now, defer_lingering_close); + /* Step 2: waitio queue timeouts */ + process_timeout_queue(waitio_q, now); - /* Step 3: write completion queue timeouts are flushed */ - process_timeout_queue(write_completion_q, now, defer_lingering_close); + /* Step 3: write completion queue timeouts */ + process_timeout_queue(write_completion_q, now); - /* Step 4: normal lingering close queue timeouts are closed */ + /* Step 4: normal lingering close queue timeouts */ if (dying && linger_q->timeout > short_linger_q->timeout) { /* Dying, force short timeout for normal lingering close */ linger_q->timeout = short_linger_q->timeout; } - process_timeout_queue(linger_q, now, shutdown_connection); + process_timeout_queue(linger_q, now); - /* Step 5: short lingering close queue timeouts are closed */ - process_timeout_queue(short_linger_q, now, shutdown_connection); + /* Step 5: short lingering close queue timeouts */ + process_timeout_queue(short_linger_q, now); + + /* Step 6: backlog queue timeouts + * Connections in backlog race with the workers (dequeuing) under + * the worker_queue mutex. 
+ */ + if (apr_atomic_read32(backlog_q->total)) { + ap_queue_lock(worker_queue); + process_timeout_queue(backlog_q, now); + ap_queue_unlock(worker_queue); + } next_expiry = queues_next_expiry; apr_thread_mutex_unlock(timeout_mutex); @@ -2740,34 +2774,17 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->write_completion = apr_atomic_read32(write_completion_q->total); ps->keep_alive = apr_atomic_read32(keepalive_q->total); ps->lingering_close = apr_atomic_read32(&lingering_count); + ps->backlog = apr_atomic_read32(backlog_q->total); ps->suspended = apr_atomic_read32(&suspended_count); ps->connections = apr_atomic_read32(&connection_count); } else if ((workers_were_busy || dying) && apr_atomic_read32(keepalive_q->total)) { apr_thread_mutex_lock(timeout_mutex); - process_keepalive_queue(0); /* kill'em all \m/ */ + shrink_timeout_queue(keepalive_q, now); apr_thread_mutex_unlock(timeout_mutex); ps->keep_alive = 0; } - - /* If there are some lingering closes to defer (to a worker), schedule - * them now. We might wakeup a worker spuriously if another one empties - * defer_linger_chain in the meantime, but there also may be no active - * or all busy workers for an undefined time. In any case a deferred - * lingering close can't starve if we do that here since the chain is - * filled only above in the listener and it's emptied only in the - * worker(s); thus a NULL here means it will stay so while the listener - * waits (possibly indefinitely) in poll(). - */ - if (defer_linger_chain) { - get_worker(&have_idle_worker, 0, &workers_were_busy); - if (have_idle_worker - && defer_linger_chain /* re-test */ - && push2worker(NULL, NULL, NULL) == APR_SUCCESS) { - have_idle_worker = 0; - } - } } /* listener main loop */ ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, @@ -2822,8 +2839,8 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) int process_slot = ti->pslot; int thread_slot = ti->tslot; worker_score *ws = &ap_scoreboard_image->servers[process_slot][thread_slot]; + int is_idler = 0; apr_status_t rv; - int is_idle = 0; free(ti); @@ -2834,26 +2851,14 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) SERVER_STARTING, NULL); for (;;) { - apr_socket_t *csd = NULL; - event_conn_state_t *cs = NULL; - timer_event_t *te = NULL; - apr_pool_t *ptrans; /* Pool for per-transaction stuff */ + ap_queue_event_t *qe; - if (!is_idle) { - rv = ap_queue_info_set_idle(worker_queue_info, NULL); - if (rv != APR_SUCCESS) { - ap_log_error(APLOG_MARK, APLOG_EMERG, rv, ap_server_conf, - APLOGNO(03270) - "ap_queue_info_set_idle failed. 
Attempting to " - "shutdown process gracefully."); - signal_threads(ST_GRACEFUL); - break; - } + if (!is_idler) { + int idlers = ap_queue_info_idlers_inc(worker_queue_info); ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "worker thread %i/%i idle (idlers %i)", - thread_slot, threads_per_child, - ap_queue_info_num_idlers(worker_queue_info)); - is_idle = 1; + thread_slot, threads_per_child, idlers); + is_idler = 1; /* If the listening sockets are paused and this new idler switches * connections_above_limit() back, let the listener know and poll @@ -2879,9 +2884,7 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) break; } - rv = ap_queue_pop_something(worker_queue, &csd, (void **)&cs, - &ptrans, &te); - + rv = ap_queue_pop_event(worker_queue, &qe); if (rv != APR_SUCCESS) { /* We get APR_EOF during a graceful shutdown once all the * connections accepted by this server process have been handled. @@ -2893,12 +2896,12 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) break; } - /* We get APR_EINTR whenever ap_queue_pop_*() has been interrupted - * from an explicit call to ap_queue_interrupt_all(). This allows - * us to unblock threads stuck in ap_queue_pop_*() when a shutdown - * is pending. + /* We get APR_EINTR whenever ap_queue_pop_event() has been + * interrupted from an explicit call to ap_queue_interrupt_*(). + * This allows us to unblock threads stuck in ap_queue_pop_event() + * when a shutdown is pending. * - * If workers_may_exit is set and this is ungraceful termination/ + * If workers_may_exit is set and this is ungraceful stop or * restart, we are bound to get an error on some systems (e.g., * AIX, which sanity-checks mutex operations) since the queue * may have already been cleaned up. 
Don't log the "error" if @@ -2906,59 +2909,60 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) */ if (!APR_STATUS_IS_EINTR(rv) && !workers_may_exit) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, ap_server_conf, - APLOGNO(03099) "ap_queue_pop_something failed"); + APLOGNO(03099) "ap_queue_pop_event failed"); AP_DEBUG_ASSERT(0); signal_threads(ST_GRACEFUL); } continue; } + is_idler = 0; /* event consumed */ ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "worker thread %i/%i busy (idlers %i)", thread_slot, threads_per_child, - ap_queue_info_num_idlers(worker_queue_info)); + ap_queue_info_idlers_count(worker_queue_info)); + + if (qe->type == AP_QUEUE_EVENT_SOCK) { + apr_pool_t *p; + apr_socket_t *csd; + event_conn_state_t *cs; + + ap_assert(qe->data.se); + p = qe->data.se->p; + csd = qe->data.se->sd; + cs = qe->data.se->baton; + ap_assert(p && csd && cs && qe == cs_qe(cs)); + + worker_sockets[thread_slot] = csd; + process_socket(thd, p, csd, cs, process_slot, thread_slot); + worker_sockets[thread_slot] = NULL; + } + else if (qe->type == AP_QUEUE_EVENT_TIMER) { + timer_event_t *te; + ap_mpm_callback_fn_t *cbfunc; + void *baton; + + te = qe->data.te; + ap_assert(te && qe == te_qe(te)); + + cbfunc = te->cbfunc; + baton = te->baton; - if (te != NULL) { - void *baton = te->baton; - ap_mpm_callback_fn_t *cbfunc = te->cbfunc; /* first recycle the timer event */ put_timer_event(te, 0); + + ap_update_child_status_from_indexes(process_slot, thread_slot, + SERVER_BUSY_WRITE, NULL); + ap_assert(cbfunc != NULL); cbfunc(baton); } else { - is_idle = 0; /* consumed */ - if (csd != NULL) { - worker_sockets[thread_slot] = csd; - process_socket(thd, ptrans, csd, cs, process_slot, thread_slot); - worker_sockets[thread_slot] = NULL; - } - } - - /* If there are deferred lingering closes, handle them now. */ - while (!workers_may_exit) { - cs = defer_linger_chain; - if (!cs) { - break; - } - if (apr_atomic_casptr((void *)&defer_linger_chain, cs->chain, - cs) != cs) { - /* Race lost, try again */ - continue; - } - cs->chain = NULL; - AP_DEBUG_ASSERT(cs->pub.state == CONN_STATE_LINGER); - - ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, - "deferred close for connection %" CS_FMT, CS_ARG(cs)); - - worker_sockets[thread_slot] = csd = cs_sd(cs); - process_socket(thd, cs->p, csd, cs, process_slot, thread_slot); - worker_sockets[thread_slot] = NULL; + ap_assert(0); } } - if (is_idle) { + if (is_idler) { /* Not idling anymore */ - ap_queue_info_wait_for_idler(worker_queue_info, NULL); + ap_queue_info_idlers_dec(worker_queue_info); } ap_update_child_status_from_indexes(process_slot, thread_slot, @@ -3011,10 +3015,10 @@ static void setup_threads_runtime(void) APR_POLLSET_KQUEUE, APR_POLLSET_EPOLL }; /* XXX: K-A or lingering close connection included in the async factor */ - const unsigned int threads_factor = worker_factor / WORKER_FACTOR_SCALE; - const apr_size_t pollset_size = ((unsigned int)num_listensocks + - (unsigned int)threads_per_child * - (threads_factor > 2 ? threads_factor : 2)); + unsigned int async_factor = (worker_factor < WORKER_FACTOR_SCALE * 2 + ? 
WORKER_FACTOR_SCALE * 2 : worker_factor); + unsigned int async_threads = (threads_per_child * async_factor / WORKER_FACTOR_SCALE); + const apr_size_t pollset_size = (num_listensocks + async_threads + POLLSET_RESERVE_SIZE); int pollset_flags; /* Event's skiplist operations will happen concurrently with other modules' @@ -3046,8 +3050,8 @@ static void setup_threads_runtime(void) apr_pool_tag(pruntime, "mpm_runtime"); /* We must create the fd queues before we start up the listener - * and worker threads. */ - rv = ap_queue_create(&worker_queue, threads_per_child, pruntime); + * and worker threads, it's bounded by connections_above_limit(). */ + rv = ap_queue_create(&worker_queue, -1, pruntime); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ALERT, rv, ap_server_conf, APLOGNO(03100) "ap_queue_create() failed"); @@ -3061,8 +3065,7 @@ static void setup_threads_runtime(void) */ max_recycled_pools = threads_per_child * 3 / 4 ; } - rv = ap_queue_info_create(&worker_queue_info, pruntime, - threads_per_child, max_recycled_pools); + rv = ap_queue_info_create(&worker_queue_info, pruntime, max_recycled_pools); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ALERT, rv, ap_server_conf, APLOGNO(03101) "ap_queue_info_create() failed"); @@ -3639,6 +3642,7 @@ static void perform_idle_server_maintenance(void) int max_daemon_used = 0; int idle_thread_count = 0; int active_thread_count = 0; + int backlog_count = 0; int i, j; for (i = 0; i < server_limit; ++i) { @@ -3682,6 +3686,7 @@ static void perform_idle_server_maintenance(void) } } active_thread_count += child_threads_active; + backlog_count += apr_atomic_read32(&ps->backlog); if (child_threads_active == threads_per_child) { had_healthy_child = 1; } @@ -3855,10 +3860,10 @@ static void perform_idle_server_maintenance(void) retained->max_daemon_used = max_daemon_used; if (APLOGdebug(ap_server_conf)) { ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, - "score: idlers:%d, " + "score: idlers:%d backlog:%d, " "threads active:%d/%d max:%d, " "daemons active:%d/%d max:%d used:%d/%d/%d", - idle_thread_count, + idle_thread_count, backlog_count, active_thread_count, retained->active_daemons * threads_per_child, max_workers, retained->active_daemons, retained->total_daemons, active_daemons_limit, max_daemon_used, retained->max_daemon_used, @@ -4425,14 +4430,12 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, active_daemons_limit = server_limit; threads_per_child = DEFAULT_THREADS_PER_CHILD; max_workers = active_daemons_limit * threads_per_child; - defer_linger_chain = NULL; had_healthy_child = 0; ap_extended_status = 0; event_pollset = NULL; worker_queue_info = NULL; listener_os_thread = NULL; - listensocks_disabled = 0; listener_is_wakeable = 0; return OK; @@ -4441,7 +4444,7 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, apr_pool_t *ptemp, server_rec *s) { - apr_hash_t *io_h, *wc_h, *ka_h; + apr_hash_t *io_h, *wc_h, *ka_h, *bl_h; /* Not needed in pre_config stage */ if (ap_state_query(AP_SQ_MAIN_STATE) == AP_SQ_MS_CREATE_PRE_CONFIG) { @@ -4451,6 +4454,7 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, io_h = apr_hash_make(ptemp); wc_h = apr_hash_make(ptemp); ka_h = apr_hash_make(ptemp); + bl_h = apr_hash_make(ptemp); linger_q = TO_QUEUE_MAKE(pconf, "linger", apr_time_from_sec(MAX_SECS_TO_LINGER), NULL); @@ -4470,6 +4474,9 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, sc->ka_q = 
TO_QUEUE_CHAIN(pconf, "keepalive", s->keep_alive_timeout, &keepalive_q, ka_h, ptemp); + + sc->bl_q = TO_QUEUE_CHAIN(pconf, "backlog", s->timeout, + &backlog_q, bl_h, ptemp); } return OK; diff --git a/server/mpm/worker/worker.c b/server/mpm/worker/worker.c index 42b81a8ed1b..1fff5b085e6 100644 --- a/server/mpm/worker/worker.c +++ b/server/mpm/worker/worker.c @@ -583,7 +583,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t *thd, void * dummy) if (listener_may_exit) break; if (!have_idle_worker) { - rv = ap_queue_info_wait_for_idler(worker_queue_info, NULL); + rv = ap_queue_info_wait_for_idler(worker_queue_info); if (APR_STATUS_IS_EOF(rv)) { break; /* we've been signaled to die now */ } @@ -662,7 +662,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t *thd, void * dummy) if (!listener_may_exit) { /* the following pops a recycled ptrans pool off a stack */ - ap_queue_info_pop_pool(worker_queue_info, &ptrans); + ptrans = ap_queue_info_pop_pool(worker_queue_info); if (ptrans == NULL) { /* we can't use a recycled transaction pool this time. * create a new transaction pool */ @@ -696,7 +696,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t *thd, void * dummy) accept_mutex_error("unlock", rv, process_slot); } if (csd != NULL) { - rv = ap_queue_push_socket(worker_queue, csd, NULL, ptrans); + rv = ap_queue_push_socket(worker_queue, csd, ptrans); if (rv) { /* trash the connection; we couldn't queue the connected * socket to a worker @@ -901,8 +901,7 @@ static void setup_threads_runtime(void) clean_child_exit(APEXIT_CHILDFATAL); } - rv = ap_queue_info_create(&worker_queue_info, pruntime, - threads_per_child, -1); + rv = ap_queue_info_create(&worker_queue_info, pruntime, -1); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ALERT, rv, ap_server_conf, APLOGNO(03141) "ap_queue_info_create() failed"); diff --git a/server/mpm_fdqueue.c b/server/mpm_fdqueue.c index 3697ca722f6..7871597d910 100644 --- a/server/mpm_fdqueue.c +++ b/server/mpm_fdqueue.c @@ -20,7 +20,23 @@ #include -static const apr_uint32_t zero_pt = APR_UINT32_MAX/2; +#define ZERO_PT (APR_UINT32_MAX / 2) + +APR_RING_HEAD(fd_queue_ring, fd_queue_elem_t); + +struct fd_queue_t +{ + struct fd_queue_ring elts; + apr_uint32_t nelts; + apr_uint32_t bounds; + apr_pool_t *spare_pool; + fd_queue_elem_t *spare_elems; + apr_thread_mutex_t *one_big_mutex; + apr_thread_cond_t *not_empty; + apr_uint32_t num_waiters; + apr_uint32_t interrupted; + apr_uint32_t terminated; +}; struct recycled_pool { @@ -30,59 +46,43 @@ struct recycled_pool struct fd_queue_info_t { - apr_uint32_t volatile idlers; /** - * >= zero_pt: number of idle worker threads - * < zero_pt: number of threads blocked, - * waiting for an idle worker - */ + apr_uint32_t volatile idlers; /* >= ZERO_PT: number of idle worker threads + * < ZERO_PT: number of events in backlog + * (waiting for an idle thread) */ apr_thread_mutex_t *idlers_mutex; apr_thread_cond_t *wait_for_idler; - int terminated; - int max_idlers; - int max_recycled_pools; - apr_uint32_t recycled_pools_count; + apr_uint32_t max_idlers; + apr_uint32_t terminated; struct recycled_pool *volatile recycled_pools; + apr_uint32_t recycled_pools_count; + apr_uint32_t max_recycled_pools; }; struct fd_queue_elem_t { - apr_socket_t *sd; - void *sd_baton; - apr_pool_t *p; + APR_RING_ENTRY(fd_queue_elem_t) link; /* in ring */ + struct fd_queue_elem_t *next; /* in spare list */ + sock_event_t self_sock_event; + ap_queue_event_t self_event; + ap_queue_event_t *event; }; -static apr_status_t 
queue_info_cleanup(void *data_) +static apr_status_t queue_info_cleanup(void *qi) { - fd_queue_info_t *qi = data_; - apr_thread_cond_destroy(qi->wait_for_idler); - apr_thread_mutex_destroy(qi->idlers_mutex); - - /* Clean up any pools in the recycled list */ - for (;;) { - struct recycled_pool *first_pool = qi->recycled_pools; - if (first_pool == NULL) { - break; - } - if (apr_atomic_casptr((void *)&qi->recycled_pools, first_pool->next, - first_pool) == first_pool) { - apr_pool_destroy(first_pool->pool); - } - } - + /* Clean up all pools in the recycled list */ + ap_queue_info_free_idle_pools(qi); return APR_SUCCESS; } -apr_status_t ap_queue_info_create(fd_queue_info_t **queue_info, - apr_pool_t *pool, int max_idlers, - int max_recycled_pools) +AP_DECLARE(apr_status_t) ap_queue_info_create(fd_queue_info_t **queue_info, + apr_pool_t *pool, int max_recycled_pools) { apr_status_t rv; fd_queue_info_t *qi; qi = apr_pcalloc(pool, sizeof(*qi)); - rv = apr_thread_mutex_create(&qi->idlers_mutex, APR_THREAD_MUTEX_DEFAULT, - pool); + rv = apr_thread_mutex_create(&qi->idlers_mutex, APR_THREAD_MUTEX_DEFAULT, pool); if (rv != APR_SUCCESS) { return rv; } @@ -90,27 +90,30 @@ apr_status_t ap_queue_info_create(fd_queue_info_t **queue_info, if (rv != APR_SUCCESS) { return rv; } - qi->recycled_pools = NULL; - qi->max_recycled_pools = max_recycled_pools; - qi->max_idlers = max_idlers; - qi->idlers = zero_pt; + qi->idlers = ZERO_PT; + if (max_recycled_pools >= 0) { + qi->max_recycled_pools = max_recycled_pools; + } + else { + qi->max_recycled_pools = APR_INT32_MAX; + } + apr_pool_cleanup_register(pool, qi, queue_info_cleanup, apr_pool_cleanup_null); *queue_info = qi; - return APR_SUCCESS; } -apr_status_t ap_queue_info_set_idle(fd_queue_info_t *queue_info, - apr_pool_t *pool_to_recycle) +AP_DECLARE(apr_status_t) ap_queue_info_set_idle(fd_queue_info_t *queue_info, + apr_pool_t *pool_to_recycle) { apr_status_t rv; ap_queue_info_push_pool(queue_info, pool_to_recycle); /* If other threads are waiting on a worker, wake one up */ - if (apr_atomic_inc32(&queue_info->idlers) < zero_pt) { + if (apr_atomic_inc32(&queue_info->idlers) < ZERO_PT) { rv = apr_thread_mutex_lock(queue_info->idlers_mutex); if (rv != APR_SUCCESS) { AP_DEBUG_ASSERT(0); @@ -130,23 +133,25 @@ apr_status_t ap_queue_info_set_idle(fd_queue_info_t *queue_info, return APR_SUCCESS; } -apr_status_t ap_queue_info_try_get_idler(fd_queue_info_t *queue_info) +AP_DECLARE(apr_status_t) ap_queue_info_try_get_idler(fd_queue_info_t *queue_info) { /* Don't block if there isn't any idle worker. */ + apr_uint32_t idlers = queue_info->idlers, val; for (;;) { - apr_uint32_t idlers = queue_info->idlers; - if (idlers <= zero_pt) { + if (idlers <= ZERO_PT) { return APR_EAGAIN; } - if (apr_atomic_cas32(&queue_info->idlers, idlers - 1, - idlers) == idlers) { + + val = apr_atomic_cas32(&queue_info->idlers, idlers - 1, idlers); + if (val == idlers) { return APR_SUCCESS; } + + idlers = val; } } -apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, - int *had_to_block) +AP_DECLARE(apr_status_t) ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info) { apr_status_t rv; @@ -154,7 +159,7 @@ apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, * apr_atomic_add32(x, -1) does the same as dec32(x), except * that it returns the previous value (unlike dec32's bool). 
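The distinction matters because the previous value tells the caller on which side of the ZERO_PT midpoint the counter was before the operation. A compact illustration of the offset-counter idiom, using a standalone counter rather than the real queue_info:

```c
#include <apr_atomic.h>

#define ZERO_PT (APR_UINT32_MAX / 2)

/* Starts at the midpoint: values above ZERO_PT count idle workers,
 * values below it count waiters that found no idle worker. */
static apr_uint32_t idlers = ZERO_PT;

/* Returns 1 if an idler was reserved without waiting, 0 if the caller
 * must block. apr_atomic_add32() returns the value *before* the
 * decrement, which is exactly the information needed here; dec32()
 * would only report whether the result is nonzero. */
static int try_reserve_idler(void)
{
    return apr_atomic_add32(&idlers, -1) > ZERO_PT;
}

static void release_idler(void)
{
    apr_atomic_inc32(&idlers);
}
```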
*/ - if (apr_atomic_add32(&queue_info->idlers, -1) <= zero_pt) { + if (apr_atomic_add32(&queue_info->idlers, -1) <= ZERO_PT) { rv = apr_thread_mutex_lock(queue_info->idlers_mutex); if (rv != APR_SUCCESS) { AP_DEBUG_ASSERT(0); @@ -177,13 +182,14 @@ apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, * now non-negative, it's safe for this function to * return immediately. * - * A "negative value" (relative to zero_pt) in + * A "negative value" (relative to ZERO_PT) in * queue_info->idlers tells how many * threads are waiting on an idle worker. */ - if (queue_info->idlers < zero_pt) { - if (had_to_block) { - *had_to_block = 1; + if (apr_atomic_read32(&queue_info->idlers) < ZERO_PT) { + if (queue_info->terminated) { + apr_thread_mutex_unlock(queue_info->idlers_mutex); + return APR_EOF; } rv = apr_thread_cond_wait(queue_info->wait_for_idler, queue_info->idlers_mutex); @@ -199,7 +205,7 @@ apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, } } - if (queue_info->terminated) { + if (apr_atomic_read32(&queue_info->terminated)) { return APR_EOF; } else { @@ -207,52 +213,75 @@ apr_status_t ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info, } } -apr_uint32_t ap_queue_info_num_idlers(fd_queue_info_t *queue_info) +AP_DECLARE(apr_uint32_t) ap_queue_info_num_idlers(fd_queue_info_t *queue_info) { - apr_uint32_t val; - val = apr_atomic_read32(&queue_info->idlers); - return (val > zero_pt) ? val - zero_pt : 0; + apr_uint32_t val = apr_atomic_read32(&queue_info->idlers); + return (val > ZERO_PT) ? val - ZERO_PT : 0; } -void ap_queue_info_push_pool(fd_queue_info_t *queue_info, - apr_pool_t *pool_to_recycle) +AP_DECLARE(apr_int32_t) ap_queue_info_idlers_count(fd_queue_info_t *queue_info) { - struct recycled_pool *new_recycle; + return apr_atomic_read32(&queue_info->idlers) - ZERO_PT; +} + +AP_DECLARE(apr_int32_t) ap_queue_info_idlers_inc(fd_queue_info_t *queue_info) +{ + /* apr_atomic_add32() returns the previous value, we return the new one */ + return apr_atomic_add32(&queue_info->idlers, +1) + 1 - ZERO_PT; +} + +AP_DECLARE(apr_int32_t) ap_queue_info_idlers_dec(fd_queue_info_t *queue_info) +{ + /* apr_atomic_add32() returns the previous value, we return the new one */ + return apr_atomic_add32(&queue_info->idlers, -1) - 1 - ZERO_PT; +} + +AP_DECLARE(void) ap_queue_info_push_pool(fd_queue_info_t *queue_info, + apr_pool_t *pool_to_recycle) +{ + struct recycled_pool *new_recycle, *first_pool, *val; + apr_uint32_t count; + /* If we have been given a pool to recycle, atomically link * it into the queue_info's list of recycled pools */ if (!pool_to_recycle) return; - if (queue_info->max_recycled_pools >= 0) { - apr_uint32_t n = apr_atomic_read32(&queue_info->recycled_pools_count); - if (n >= queue_info->max_recycled_pools) { - apr_pool_destroy(pool_to_recycle); - return; - } - apr_atomic_inc32(&queue_info->recycled_pools_count); + /* The counting is racy but we don't mind recycling a few more/less pools, + * it's lighter than a compare & swap loop or an inc + dec to back out. 
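The racy-but-bounded counting mentioned above is a deliberate trade-off that also stands on its own; here is a sketch with a hypothetical soft-bounded cache (MAX_CACHED and cache_pool() are made up for the example):

```c
#include <apr_atomic.h>
#include <apr_pools.h>

#define MAX_CACHED 32
static apr_uint32_t cached_count;

/* Best-effort bound: the read and the increment are not atomic as a
 * pair, so two racing threads can both pass the test and overshoot
 * MAX_CACHED by an entry or two. That is acceptable for a soft limit
 * and cheaper than a CAS retry loop (or an inc followed by a dec to
 * back out) on every recycled pool. */
static int cache_pool(apr_pool_t *p)
{
    if (apr_atomic_read32(&cached_count) >= MAX_CACHED) {
        apr_pool_destroy(p);      /* over the soft limit: just free it */
        return 0;
    }
    apr_atomic_inc32(&cached_count);
    /* ... link p into the (atomic) free list, as the code above does ... */
    return 1;
}
```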
+     */
+    count = apr_atomic_read32(&queue_info->recycled_pools_count);
+    if (count >= queue_info->max_recycled_pools) {
+        apr_pool_destroy(pool_to_recycle);
+        return;
     }
+    apr_atomic_inc32(&queue_info->recycled_pools_count);

     apr_pool_clear(pool_to_recycle);
     new_recycle = apr_palloc(pool_to_recycle, sizeof *new_recycle);
     new_recycle->pool = pool_to_recycle;
+
+    first_pool = queue_info->recycled_pools;
     for (;;) {
-        /*
-         * Save queue_info->recycled_pool in local variable next because
-         * new_recycle->next can be changed after apr_atomic_casptr
-         * function call. For gory details see PR 44402.
+        new_recycle->next = first_pool;
+        val = apr_atomic_casptr((void *)&queue_info->recycled_pools,
+                                new_recycle, first_pool);
+        /* Don't compare with new_recycle->next because it can change
+         * after apr_atomic_casptr(). For gory details see PR 44402.
         */
-        struct recycled_pool *next = queue_info->recycled_pools;
-        new_recycle->next = next;
-        if (apr_atomic_casptr((void *)&queue_info->recycled_pools,
-                              new_recycle, next) == next)
-            break;
+        if (val == first_pool) {
+            return;
+        }
+
+        first_pool = val;
     }
 }

-void ap_queue_info_pop_pool(fd_queue_info_t *queue_info,
-                            apr_pool_t **recycled_pool)
+AP_DECLARE(apr_pool_t *) ap_queue_info_pop_pool(fd_queue_info_t *queue_info)
 {
+    struct recycled_pool *first_pool, *val;
+
     /* Atomically pop a pool from the recycled list */

     /* This function is safe only as long as it is single threaded because
@@ -262,41 +291,43 @@ void ap_queue_info_pop_pool(fd_queue_info_t *queue_info,
      * happen concurrently with a single cas-based pop.
      */

-    *recycled_pool = NULL;
-
-
-    /* Atomically pop a pool from the recycled list */
+    first_pool = queue_info->recycled_pools;
     for (;;) {
-        struct recycled_pool *first_pool = queue_info->recycled_pools;
         if (first_pool == NULL) {
-            break;
+            return NULL;
         }
-        if (apr_atomic_casptr((void *)&queue_info->recycled_pools,
-                              first_pool->next, first_pool) == first_pool) {
-            *recycled_pool = first_pool->pool;
-            if (queue_info->max_recycled_pools >= 0)
-                apr_atomic_dec32(&queue_info->recycled_pools_count);
-            break;
+
+        val = apr_atomic_casptr((void *)&queue_info->recycled_pools,
+                                first_pool->next, first_pool);
+        if (val == first_pool) {
+            apr_atomic_dec32(&queue_info->recycled_pools_count);
+            return first_pool->pool;
         }
+
+        first_pool = val;
     }
 }

-void ap_queue_info_free_idle_pools(fd_queue_info_t *queue_info)
+AP_DECLARE(void) ap_queue_info_free_idle_pools(fd_queue_info_t *queue_info)
 {
     apr_pool_t *p;

-    queue_info->max_recycled_pools = 0;
+    /* Atomically free the recycled list */
+
+    /* As with ap_queue_info_pop_pool(), this must not be called concurrently
+     * with itself; for now it's only called from the listener thread.
+     */
+
     for (;;) {
-        ap_queue_info_pop_pool(queue_info, &p);
+        p = ap_queue_info_pop_pool(queue_info);
         if (p == NULL)
-            break;
+            return;
         apr_pool_destroy(p);
     }
-    apr_atomic_set32(&queue_info->recycled_pools_count, 0);
 }

-apr_status_t ap_queue_info_term(fd_queue_info_t *queue_info)
+AP_DECLARE(apr_status_t) ap_queue_info_term(fd_queue_info_t *queue_info)
 {
     apr_status_t rv;
@@ -305,47 +336,35 @@ apr_status_t ap_queue_info_term(fd_queue_info_t *queue_info)
         return rv;
     }

-    queue_info->terminated = 1;
+    apr_atomic_set32(&queue_info->terminated, 1);
     apr_thread_cond_broadcast(queue_info->wait_for_idler);
     return apr_thread_mutex_unlock(queue_info->idlers_mutex);
 }

-/**
+/*
+ * Lock/unlock the fd_queue_t.
+ */ +#define queue_lock(q) apr_thread_mutex_lock((q)->one_big_mutex) +#define queue_unlock(q) apr_thread_mutex_unlock((q)->one_big_mutex) + +/* * Detects when the fd_queue_t is full. This utility function is expected * to be called from within critical sections, and is not threadsafe. */ -#define ap_queue_full(queue) ((queue)->nelts == (queue)->bounds) +#define queue_full(q) ((q)->nelts == (q)->bounds) -/** +/* * Detects when the fd_queue_t is empty. This utility function is expected * to be called from within critical sections, and is not threadsafe. */ -#define ap_queue_empty(queue) ((queue)->nelts == 0 && \ - APR_RING_EMPTY(&queue->timers, \ - timer_event_t, link)) +#define queue_empty(q) ((q)->nelts == 0) -/** - * Callback routine that is called to destroy this - * fd_queue_t when its pool is destroyed. - */ -static apr_status_t ap_queue_destroy(void *data) -{ - fd_queue_t *queue = data; - - /* Ignore errors here, we can't do anything about them anyway. - * XXX: We should at least try to signal an error here, it is - * indicative of a programmer error. -aaron */ - apr_thread_cond_destroy(queue->not_empty); - apr_thread_mutex_destroy(queue->one_big_mutex); - - return APR_SUCCESS; -} - -/** +/* * Initialize the fd_queue_t. */ -apr_status_t ap_queue_create(fd_queue_t **pqueue, int capacity, apr_pool_t *p) +AP_DECLARE(apr_status_t) ap_queue_create(fd_queue_t **pqueue, int capacity, + apr_pool_t *p) { apr_status_t rv; fd_queue_t *queue; @@ -361,143 +380,264 @@ apr_status_t ap_queue_create(fd_queue_t **pqueue, int capacity, apr_pool_t *p) return rv; } - APR_RING_INIT(&queue->timers, timer_event_t, link); - - queue->data = apr_pcalloc(p, capacity * sizeof(fd_queue_elem_t)); - queue->bounds = capacity; + apr_pool_create(&queue->spare_pool, p); + APR_RING_INIT(&queue->elts, fd_queue_elem_t, link); + if (capacity > 0) { + queue->bounds = capacity; + } + else { + queue->bounds = APR_UINT32_MAX; + } - apr_pool_cleanup_register(p, queue, ap_queue_destroy, - apr_pool_cleanup_null); *pqueue = queue; - return APR_SUCCESS; } -/** - * Push a new socket onto the queue. - * - * precondition: ap_queue_info_wait_for_idler has already been called - * to reserve an idle worker thread - */ -apr_status_t ap_queue_push_socket(fd_queue_t *queue, - apr_socket_t *sd, void *sd_baton, - apr_pool_t *p) +static APR_INLINE fd_queue_elem_t *get_spare_elem(fd_queue_t *queue) +{ + fd_queue_elem_t *elem = queue->spare_elems; + if (elem == NULL) { + elem = apr_pcalloc(queue->spare_pool, sizeof(*elem)); + } + else { + queue->spare_elems = elem->next; + elem->next = NULL; + } + return elem; +} + +static APR_INLINE void put_spare_elem(fd_queue_t *queue, fd_queue_elem_t *elem) +{ + elem->event = NULL; + elem->next = queue->spare_elems; + queue->spare_elems = elem; +} + +static APR_INLINE void enqueue_elem(fd_queue_t *queue, fd_queue_elem_t *elem, + ap_queue_event_t *event) +{ + if (event) { + elem->event = event; + } + else { + elem->event = &elem->self_event; + } + elem->event->elem = elem; + + APR_RING_INSERT_TAIL(&queue->elts, elem, fd_queue_elem_t, link); + queue->nelts++; +} + +static APR_INLINE void dequeue_elem(fd_queue_t *queue, fd_queue_elem_t *elem) +{ + elem->event->elem = NULL; + ap_assert(queue->nelts > 0); + APR_RING_REMOVE(elem, link); + APR_RING_ELEM_INIT(elem, link); + queue->nelts--; +} + +/* Pushes the last available element to the queue. 
*/ +static void push_elem(fd_queue_t *queue, fd_queue_elem_t **pushed_elem, + ap_queue_event_t *event) { fd_queue_elem_t *elem; + + AP_DEBUG_ASSERT(!queue_full(queue)); + AP_DEBUG_ASSERT(!queue->terminated); + + elem = get_spare_elem(queue); + enqueue_elem(queue, elem, event); + + if (pushed_elem) { + *pushed_elem = elem; + } +} + +/* + * Retrieves the oldest available element from the queue, waiting until one + * becomes available. + */ +static apr_status_t pop_elem(fd_queue_t *queue, fd_queue_elem_t **pelem) +{ apr_status_t rv; - if ((rv = apr_thread_mutex_lock(queue->one_big_mutex)) != APR_SUCCESS) { + for (;;) { + if (queue->terminated) { + return APR_EOF; /* no more elements ever again */ + } + + if (queue->interrupted) { + queue->interrupted--; + return APR_EINTR; + } + + if (!queue_empty(queue)) { + *pelem = APR_RING_FIRST(&queue->elts); + dequeue_elem(queue, *pelem); + return APR_SUCCESS; + } + + queue->num_waiters++; + rv = apr_thread_cond_wait(queue->not_empty, queue->one_big_mutex); + queue->num_waiters--; + if (rv != APR_SUCCESS) { + return rv; + } + } +} + +AP_DECLARE(apr_status_t) ap_queue_push_event(fd_queue_t *queue, + ap_queue_event_t *event) +{ + apr_status_t rv; + + if ((rv = queue_lock(queue)) != APR_SUCCESS) { return rv; } - AP_DEBUG_ASSERT(!queue->terminated); - AP_DEBUG_ASSERT(!ap_queue_full(queue)); - - elem = &queue->data[queue->in++]; - if (queue->in >= queue->bounds) - queue->in -= queue->bounds; - elem->sd = sd; - elem->sd_baton = sd_baton; - elem->p = p; - queue->nelts++; + switch (event->type) { + case AP_QUEUE_EVENT_SOCK: + case AP_QUEUE_EVENT_TIMER: + case AP_QUEUE_EVENT_BATON: + push_elem(queue, NULL, event); + if (event->cb) { + event->cb(event->cb_baton, 1); + } + apr_thread_cond_signal(queue->not_empty); + break; - apr_thread_cond_signal(queue->not_empty); + default: + rv = APR_EINVAL; + break; + } - return apr_thread_mutex_unlock(queue->one_big_mutex); + queue_unlock(queue); + return rv; } -apr_status_t ap_queue_push_timer(fd_queue_t *queue, timer_event_t *te) +AP_DECLARE(apr_status_t) ap_queue_pop_event(fd_queue_t *queue, + ap_queue_event_t **pevent) { apr_status_t rv; + fd_queue_elem_t *elem; + + *pevent = NULL; - if ((rv = apr_thread_mutex_lock(queue->one_big_mutex)) != APR_SUCCESS) { + if ((rv = queue_lock(queue)) != APR_SUCCESS) { return rv; } - AP_DEBUG_ASSERT(!queue->terminated); + rv = pop_elem(queue, &elem); + if (rv == APR_SUCCESS) { + ap_queue_event_t *event = elem->event; + ap_assert(event && event != &elem->self_event); + put_spare_elem(queue, elem); + if (event->cb) { + event->cb(event->cb_baton, 0); + } + *pevent = event; + } - APR_RING_INSERT_TAIL(&queue->timers, te, timer_event_t, link); + queue_unlock(queue); + return rv; +} - apr_thread_cond_signal(queue->not_empty); +AP_DECLARE(void) ap_queue_kill_event_locked(fd_queue_t *queue, + ap_queue_event_t *event) +{ + fd_queue_elem_t *elem = event->elem; + ap_assert(elem && APR_RING_NEXT(elem, link) != elem); - return apr_thread_mutex_unlock(queue->one_big_mutex); + dequeue_elem(queue, elem); + put_spare_elem(queue, elem); + if (event->cb) { + event->cb(event->cb_baton, 0); + } +} + +AP_DECLARE(apr_status_t) ap_queue_lock(fd_queue_t *queue) +{ + return queue_lock(queue); +} + +AP_DECLARE(apr_status_t) ap_queue_unlock(fd_queue_t *queue) +{ + return queue_unlock(queue); } /** - * Retrieves the next available socket from the queue. If there are no - * sockets available, it will block until one becomes available. - * Once retrieved, the socket is placed into the address specified by - * 'sd'. 
+ * Push a socket onto the queue. */ -apr_status_t ap_queue_pop_something(fd_queue_t *queue, - apr_socket_t **sd, void **sd_baton, - apr_pool_t **p, timer_event_t **te_out) +AP_DECLARE(apr_status_t) ap_queue_push_socket(fd_queue_t *queue, apr_socket_t *sd, + apr_pool_t *p) { - fd_queue_elem_t *elem; - timer_event_t *te; apr_status_t rv; + fd_queue_elem_t *elem; + + ap_assert(sd != NULL); - if ((rv = apr_thread_mutex_lock(queue->one_big_mutex)) != APR_SUCCESS) { + if ((rv = queue_lock(queue)) != APR_SUCCESS) { return rv; } - /* Keep waiting until we wake up and find that the queue is not empty. */ - if (ap_queue_empty(queue)) { - if (!queue->terminated) { - apr_thread_cond_wait(queue->not_empty, queue->one_big_mutex); - } - /* If we wake up and it's still empty, then we were interrupted */ - if (ap_queue_empty(queue)) { - rv = apr_thread_mutex_unlock(queue->one_big_mutex); - if (rv != APR_SUCCESS) { - return rv; - } - if (queue->terminated) { - return APR_EOF; /* no more elements ever again */ - } - else { - return APR_EINTR; - } - } + push_elem(queue, &elem, NULL); + elem->event->type = AP_QUEUE_EVENT_SOCK; + elem->event->data.se = &elem->self_sock_event; + elem->event->data.se->baton = NULL; + elem->event->data.se->sd = sd; + elem->event->data.se->p = p; + + apr_thread_cond_signal(queue->not_empty); + + queue_unlock(queue); + return APR_SUCCESS; +} + +/** + * Pop a socket from the queue. + */ +AP_DECLARE(apr_status_t) ap_queue_pop_socket(fd_queue_t *queue, apr_socket_t **psd, + apr_pool_t **pp) +{ + apr_status_t rv; + fd_queue_elem_t *elem; + + if (psd) { + *psd = NULL; + } + if (pp) { + *pp = NULL; } - te = NULL; - if (te_out) { - if (!APR_RING_EMPTY(&queue->timers, timer_event_t, link)) { - te = APR_RING_FIRST(&queue->timers); - APR_RING_REMOVE(te, link); - } - *te_out = te; + if ((rv = queue_lock(queue)) != APR_SUCCESS) { + return rv; } - if (!te) { - elem = &queue->data[queue->out++]; - if (queue->out >= queue->bounds) - queue->out -= queue->bounds; - queue->nelts--; - *sd = elem->sd; - if (sd_baton) { - *sd_baton = elem->sd_baton; + rv = pop_elem(queue, &elem); + if (rv == APR_SUCCESS) { + ap_queue_event_t *event = elem->event; + ap_assert(event && event == &elem->self_event); + ap_assert(event->data.se == &elem->self_sock_event); + ap_assert(event->type == AP_QUEUE_EVENT_SOCK); + if (psd) { + *psd = event->data.se->sd; + } + if (pp) { + *pp = event->data.se->p; } - *p = elem->p; -#ifdef AP_DEBUG - elem->sd = NULL; - elem->p = NULL; -#endif /* AP_DEBUG */ + put_spare_elem(queue, elem); } - return apr_thread_mutex_unlock(queue->one_big_mutex); + queue_unlock(queue); + return rv; } static apr_status_t queue_interrupt(fd_queue_t *queue, int all, int term) { apr_status_t rv; - if (queue->terminated) { - return APR_EOF; - } - - if ((rv = apr_thread_mutex_lock(queue->one_big_mutex)) != APR_SUCCESS) { + if ((rv = queue_lock(queue)) != APR_SUCCESS) { return rv; } @@ -505,15 +645,21 @@ static apr_status_t queue_interrupt(fd_queue_t *queue, int all, int term) * we could end up setting it and waking everybody up just after a * would-be popper checks it but right before they block */ + queue->interrupted = 1; if (term) { queue->terminated = 1; } - if (all) + if (all) { + if (queue->num_waiters > 1) + queue->interrupted += queue->num_waiters - 1; apr_thread_cond_broadcast(queue->not_empty); - else + } + else { apr_thread_cond_signal(queue->not_empty); + } - return apr_thread_mutex_unlock(queue->one_big_mutex); + queue_unlock(queue); + return APR_SUCCESS; } apr_status_t 
ap_queue_interrupt_all(fd_queue_t *queue)

diff --git a/server/mpm_fdqueue.h b/server/mpm_fdqueue.h
index 260e22ab80e..29297fd60d5 100644
--- a/server/mpm_fdqueue.h
+++ b/server/mpm_fdqueue.h
@@ -27,7 +27,7 @@
 #include

-/* This code is not AP_DECLARE()ed/exported, and used by MPMs event/worker
+/* This code is AP_DECLARE()ed/exported but used by MPMs event/worker
  * only (for now), not worth thinking about w/o threads either...
  */
 #if APR_HAS_THREADS
@@ -40,28 +40,48 @@
 #include
 #include

+struct fd_queue_t;      /* opaque */
 struct fd_queue_info_t; /* opaque */
 struct fd_queue_elem_t; /* opaque */
+typedef struct fd_queue_t fd_queue_t;
 typedef struct fd_queue_info_t fd_queue_info_t;
 typedef struct fd_queue_elem_t fd_queue_elem_t;

 AP_DECLARE(apr_status_t) ap_queue_info_create(fd_queue_info_t **queue_info,
-                                              apr_pool_t *pool, int max_idlers,
-                                              int max_recycled_pools);
+                                              apr_pool_t *pool, int max_recycled_pools);
 AP_DECLARE(apr_status_t) ap_queue_info_set_idle(fd_queue_info_t *queue_info,
                                                 apr_pool_t *pool_to_recycle);
 AP_DECLARE(apr_status_t) ap_queue_info_try_get_idler(fd_queue_info_t *queue_info);
-AP_DECLARE(apr_status_t) ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info,
-                                                      int *had_to_block);
+AP_DECLARE(apr_status_t) ap_queue_info_wait_for_idler(fd_queue_info_t *queue_info);
 AP_DECLARE(apr_uint32_t) ap_queue_info_num_idlers(fd_queue_info_t *queue_info);
 AP_DECLARE(apr_status_t) ap_queue_info_term(fd_queue_info_t *queue_info);

-AP_DECLARE(void) ap_queue_info_pop_pool(fd_queue_info_t *queue_info,
-                                        apr_pool_t **recycled_pool);
+/* Async API */
+AP_DECLARE(apr_int32_t) ap_queue_info_idlers_inc(fd_queue_info_t *queue_info);
+AP_DECLARE(apr_int32_t) ap_queue_info_idlers_dec(fd_queue_info_t *queue_info);
+AP_DECLARE(apr_int32_t) ap_queue_info_idlers_count(fd_queue_info_t *queue_info);
+
+AP_DECLARE(apr_pool_t *) ap_queue_info_pop_pool(fd_queue_info_t *queue_info);
 AP_DECLARE(void) ap_queue_info_push_pool(fd_queue_info_t *queue_info,
                                          apr_pool_t *pool_to_recycle);
 AP_DECLARE(void) ap_queue_info_free_idle_pools(fd_queue_info_t *queue_info);

+enum ap_queue_event_type_e
+{
+    AP_QUEUE_EVENT_SOCK,
+    AP_QUEUE_EVENT_TIMER,
+    AP_QUEUE_EVENT_BATON,
+};
+typedef enum ap_queue_event_type_e ap_queue_event_type_e;
+
+struct sock_event_t
+{
+    apr_pool_t *p;
+    apr_socket_t *sd;
+    void *baton;
+};
+typedef struct sock_event_t sock_event_t;
+
 struct timer_event_t
 {
     APR_RING_ENTRY(timer_event_t) link;
@@ -74,33 +94,47 @@ struct timer_event_t
 };
 typedef struct timer_event_t timer_event_t;

-struct fd_queue_t
-{
-    APR_RING_HEAD(timers_t, timer_event_t) timers;
-    fd_queue_elem_t *data;
-    unsigned int nelts;
-    unsigned int bounds;
-    unsigned int in;
-    unsigned int out;
-    apr_thread_mutex_t *one_big_mutex;
-    apr_thread_cond_t *not_empty;
-    volatile int terminated;
+struct ap_queue_event_t
+{
+    /* event data */
+    ap_queue_event_type_e type;
+    union {
+        sock_event_t *se;
+        timer_event_t *te;
+        void *baton;
+    } data;
+
+    /* called back when the event is pushed/popped,
+     * under the queue lock (must not block!)
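A hypothetical usage sketch for this event API (not part of the patch): a baton event whose callback only touches an atomic counter, which satisfies the "must not block under the queue lock" contract stated above:

```c
#include <apr_atomic.h>
#include "mpm_fdqueue.h"

static apr_uint32_t queued_batons;

/* Runs under the queue lock on push (pushed == 1) and pop (0), so it
 * must not block; bumping an atomic counter is fine. */
static void count_cb(void *cb_baton, int pushed)
{
    (void)cb_baton;
    if (pushed)
        apr_atomic_inc32(&queued_batons);
    else
        apr_atomic_dec32(&queued_batons);
}

static apr_status_t queue_baton(fd_queue_t *queue, ap_queue_event_t *event,
                                void *data)
{
    /* The event must live at least until it is popped; callers would
     * allocate it from a suitably long-lived pool. */
    event->type = AP_QUEUE_EVENT_BATON;
    event->data.baton = data;
    event->cb = count_cb;
    event->cb_baton = NULL;
    return ap_queue_push_event(queue, event);
}
```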
+ */ + void (*cb)(void *baton, int pushed); + void *cb_baton; + + /* link in container when queued (for internal use) */ + fd_queue_elem_t *elem; }; -typedef struct fd_queue_t fd_queue_t; +typedef struct ap_queue_event_t ap_queue_event_t; + +AP_DECLARE(apr_status_t) ap_queue_create(fd_queue_t **pqueue, int capacity, + apr_pool_t *p); + +/* mpm_event API (queue of any event) */ +AP_DECLARE(apr_status_t) ap_queue_push_event(fd_queue_t *queue, + ap_queue_event_t *event); +AP_DECLARE(apr_status_t) ap_queue_pop_event(fd_queue_t *queue, + ap_queue_event_t **pevent); +AP_DECLARE(apr_status_t) ap_queue_lock(fd_queue_t *queue); +AP_DECLARE(void) ap_queue_kill_event_locked(fd_queue_t *queue, + ap_queue_event_t *event); +AP_DECLARE(apr_status_t) ap_queue_unlock(fd_queue_t *queue); -AP_DECLARE(apr_status_t) ap_queue_create(fd_queue_t **pqueue, - int capacity, apr_pool_t *p); -AP_DECLARE(apr_status_t) ap_queue_push_socket(fd_queue_t *queue, - apr_socket_t *sd, void *sd_baton, +/* mpm_worker API (queue of socket_event_t only) */ +AP_DECLARE(apr_status_t) ap_queue_push_socket(fd_queue_t *queue, apr_socket_t *sd, apr_pool_t *p); -AP_DECLARE(apr_status_t) ap_queue_push_timer(fd_queue_t *queue, - timer_event_t *te); -AP_DECLARE(apr_status_t) ap_queue_pop_something(fd_queue_t *queue, - apr_socket_t **sd, void **sd_baton, - apr_pool_t **p, timer_event_t **te); -#define ap_queue_pop_socket(q_, s_, p_) \ - ap_queue_pop_something((q_), (s_), NULL, (p_), NULL) +AP_DECLARE(apr_status_t) ap_queue_pop_socket(fd_queue_t *queue, apr_socket_t **psd, + apr_pool_t **pp); +/* common API */ AP_DECLARE(apr_status_t) ap_queue_interrupt_all(fd_queue_t *queue); AP_DECLARE(apr_status_t) ap_queue_interrupt_one(fd_queue_t *queue); AP_DECLARE(apr_status_t) ap_queue_term(fd_queue_t *queue); From aa04f2aab4588075f0f63dc9b19d19f264a7dbfe Mon Sep 17 00:00:00 2001 From: ylavic Date: Fri, 7 Jul 2023 13:04:42 +0200 Subject: [PATCH 11/22] core,mpm_event: Non blocking shutdown. --- include/http_connection.h | 9 ++- include/scoreboard.h | 1 + modules/generators/mod_status.c | 17 +++-- modules/lua/lua_request.c | 4 ++ server/connection.c | 23 +++--- server/mpm/event/event.c | 119 ++++++++++++++++++++++---------- 6 files changed, 120 insertions(+), 53 deletions(-) diff --git a/include/http_connection.h b/include/http_connection.h index 601a4769109..78371efbb27 100644 --- a/include/http_connection.h +++ b/include/http_connection.h @@ -43,10 +43,15 @@ extern "C" { */ AP_CORE_DECLARE(void) ap_process_connection(conn_rec *c, void *csd); +#define AP_SHUTDOWN_CONN_NOFLUSH 0 +#define AP_SHUTDOWN_CONN_FLUSH 1 +#define AP_SHUTDOWN_CONN_WC 2 + /** * Shutdown the connection for writing. 
 * @param c The connection to shutdown
- * @param flush Whether or not to flush pending data before
+ * @param flush Whether to flush pending data before, and if so how to
+ *              (AP_SHUTDOWN_CONN_* flags)
 * @return APR_SUCCESS or the underlying error
 */
AP_CORE_DECLARE(apr_status_t) ap_shutdown_conn(conn_rec *c, int flush);
@@ -54,7 +59,7 @@ AP_CORE_DECLARE(apr_status_t) ap_shutdown_conn(conn_rec *c, int flush);
 /**
  * Flushes all remain data in the client send buffer
  * @param c The connection to flush
- * @remark calls ap_shutdown_conn(c, 1)
+ * @remark calls ap_shutdown_conn(c, AP_SHUTDOWN_CONN_FLUSH)
  */
 AP_CORE_DECLARE(void) ap_flush_conn(conn_rec *c);
diff --git a/include/scoreboard.h b/include/scoreboard.h
index e83e52fdb16..581f86b866c 100644
--- a/include/scoreboard.h
+++ b/include/scoreboard.h
@@ -149,6 +149,7 @@ struct process_score {
     apr_uint32_t keep_alive;  /* async connections in keep alive */
     apr_uint32_t suspended;   /* connections suspended by some module */
     apr_uint32_t wait_io;     /* async connections waiting an IO in the MPM */
+    apr_uint32_t shutdown;    /* async connections shutting down before close */
     apr_uint32_t backlog;     /* async connections waiting for a worker */
 };
diff --git a/modules/generators/mod_status.c b/modules/generators/mod_status.c
index f0cff67ac45..5ff635cc96e 100644
--- a/modules/generators/mod_status.c
+++ b/modules/generators/mod_status.c
@@ -564,8 +564,8 @@ static int status_handler(request_rec *r)
     ap_rputs("", r);

     if (is_async) {
-        int wait_io = 0, write_completion = 0, lingering_close = 0, keep_alive = 0,
-            connections = 0, stopping = 0, procs = 0;
+        int wait_io = 0, write_completion = 0, shutdown = 0, lingering_close = 0,
+            keep_alive = 0, connections = 0, stopping = 0, procs = 0;
         if (!short_report)
             ap_rputs("\n\n<table rules=\"all\" cellpadding=\"1%\">\n"
                      "<tr><th rowspan=\"2\">Slot</th>"
                      "<th rowspan=\"2\">PID</th>"
                      "<th rowspan=\"2\">Stopping</th>"
                      "<th colspan=\"2\">Connections</th>\n"
                      "<th colspan=\"3\">Threads</th>"
-                     "<th colspan=\"4\">Async connections</th></tr>\n"
+                     "<th colspan=\"5\">Async connections</th></tr>\n"
                      "<tr><th>total</th><th>accepting</th>"
                      "<th>busy</th><th>graceful</th><th>idle</th>"
-                     "<th>wait-io</th><th>writing</th><th>keep-alive</th><th>closing</th></tr>\n", r);
+                     "<th>wait-io</th><th>writing</th><th>keep-alive</th>"
+                     "<th>shutdown</th><th>closing</th></tr>\n", r);
         for (i = 0; i < server_limit; ++i) {
             ps_record = ap_get_scoreboard_process(i);
             if (ps_record->pid) {
@@ -585,6 +585,7 @@ static int status_handler(request_rec *r)
                 wait_io += ps_record->wait_io;
                 write_completion += ps_record->write_completion;
                 keep_alive += ps_record->keep_alive;
+                shutdown += ps_record->shutdown;
                 lingering_close += ps_record->lingering_close;
                 procs++;
                 if (ps_record->quiescing) {
@@ -601,7 +602,7 @@ static int status_handler(request_rec *r)
                 ap_rprintf(r, "<tr><td>%u</td><td>%" APR_PID_T_FMT "</td>"
                               "<td>%s%s</td><td>%u</td><td>%s</td>"
                               "<td>%u</td><td>%u</td><td>%u</td>"
-                              "<td>%u</td><td>%u</td><td>%u</td>"
+                              "<td>%u</td><td>%u</td><td>%u</td><td>%u</td>"
                               "<td>%u</td></tr>\n",
                               i, ps_record->pid,
@@ -614,6 +615,7 @@ static int status_handler(request_rec *r)
                               ps_record->wait_io,
                               ps_record->write_completion,
                               ps_record->keep_alive,
+                              ps_record->shutdown,
                               ps_record->lingering_close);
             }
         }
@@ -622,14 +624,14 @@ static int status_handler(request_rec *r)
         ap_rprintf(r, "<tr><td>Sum</td>"
                       "<td>%d</td><td>%d</td>"
                       "<td>%d</td><td>&nbsp;</td>"
                       "<td>%d</td><td>%d</td><td>%d</td>"
-                      "<td>%d</td><td>%d</td><td>%d</td>"
+                      "<td>%d</td><td>%d</td><td>%d</td><td>%d</td>"
                       "<td>%d</td></tr>
\n", procs, stopping, connections, busy, graceful, idle, wait_io, write_completion, keep_alive, - lingering_close); + shutdown, lingering_close); } else { ap_rprintf(r, "Processes: %d\n" @@ -638,11 +640,12 @@ static int status_handler(request_rec *r) "ConnsAsyncWaitIO: %d\n" "ConnsAsyncWriting: %d\n" "ConnsAsyncKeepAlive: %d\n" + "ConnsAsyncShutdown: %d\n" "ConnsAsyncClosing: %d\n", procs, stopping, connections, wait_io, write_completion, keep_alive, - lingering_close); + shutdown, lingering_close); } } diff --git a/modules/lua/lua_request.c b/modules/lua/lua_request.c index 5fa3a968c6b..f93c3493af4 100644 --- a/modules/lua/lua_request.c +++ b/modules/lua/lua_request.c @@ -1276,6 +1276,10 @@ static int lua_ap_scoreboard_process(lua_State *L) lua_pushnumber(L, ps_record->write_completion); lua_settable(L, -3); + lua_pushstring(L, "shutdown"); + lua_pushnumber(L, ps_record->shutdown); + lua_settable(L, -3); + lua_pushstring(L, "not_accepting"); lua_pushnumber(L, ps_record->not_accepting); lua_settable(L, -3); diff --git a/server/connection.c b/server/connection.c index a1c4c1860f0..383b769660f 100644 --- a/server/connection.c +++ b/server/connection.c @@ -111,37 +111,42 @@ AP_CORE_DECLARE(apr_status_t) ap_shutdown_conn(conn_rec *c, int flush) apr_bucket_brigade *bb; apr_bucket *b; - bb = apr_brigade_create(c->pool, c->bucket_alloc); + bb = ap_acquire_brigade(c); - if (flush) { + if (flush == AP_SHUTDOWN_CONN_WC) { + /* Write Completion bucket */ + b = ap_bucket_wc_create(c->bucket_alloc); + } + else { /* FLUSH bucket */ b = apr_bucket_flush_create(c->bucket_alloc); - APR_BRIGADE_INSERT_TAIL(bb, b); } + APR_BRIGADE_INSERT_TAIL(bb, b); /* End Of Connection bucket */ b = ap_bucket_eoc_create(c->bucket_alloc); APR_BRIGADE_INSERT_TAIL(bb, b); rv = ap_pass_brigade(c->output_filters, bb); - apr_brigade_destroy(bb); + ap_release_brigade(c, bb); return rv; } AP_CORE_DECLARE(void) ap_flush_conn(conn_rec *c) { - (void)ap_shutdown_conn(c, 1); + (void)ap_shutdown_conn(c, AP_SHUTDOWN_CONN_FLUSH); } AP_DECLARE(int) ap_prep_lingering_close(conn_rec *c) { /* Give protocol handlers one last chance to raise their voice */ - ap_run_pre_close_connection(c); + int rc = ap_run_pre_close_connection(c); if (c->sbh) { ap_update_child_status(c->sbh, SERVER_CLOSING, NULL); } - return 0; + + return (rc == DECLINED) ? OK : rc; } /* we now proceed to read from the client until we get EOF, or until @@ -172,7 +177,9 @@ AP_DECLARE(int) ap_start_lingering_close(conn_rec *c) */ /* Send any leftover data to the client, but never try to again */ - ap_flush_conn(c); + if (ap_shutdown_conn(c, AP_SHUTDOWN_CONN_FLUSH)) { + return 1; + } #ifdef NO_LINGCLOSE return 1; diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 5a9f4b676b4..8c5bee23115 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -301,7 +301,9 @@ struct event_conn_state_t { /** Is lingering close from defer_lingering_close()? */ deferred_linger :1, /** Has ap_start_lingering_close() been called? */ - linger_started :1; + linger_started :1, + /** Is lingering connection flushed and shutdown? 
*/ + linger_shutdown :1; }; #define cs_se(cs) (&(cs)->bse.se) #define cs_qe(cs) (&(cs)->bse.qe) @@ -455,6 +457,7 @@ struct timeout_queue { * waitio_q uses vhost's TimeOut * write_completion_q uses vhost's TimeOut * keepalive_q uses vhost's KeepAliveTimeOut + * shutdown_q uses vhost's TimeOut * linger_q uses MAX_SECS_TO_LINGER * short_linger_q uses SECONDS_TO_LINGER * backlog_q uses vhost's TimeOut @@ -462,6 +465,7 @@ struct timeout_queue { static struct timeout_queue *waitio_q, /* wait for I/O to happen */ *write_completion_q, /* completion or user async poll */ *keepalive_q, /* in between requests */ + *shutdown_q, /* shutting down (write) before close */ *linger_q, /* lingering (read) before close */ *short_linger_q, /* lingering (read) before close (short timeout) */ *backlog_q; /* waiting for a worker */ @@ -658,6 +662,7 @@ struct event_srv_cfg_s { struct timeout_queue *io_q, *wc_q, *ka_q, + *sh_q, *bl_q; server_rec *s; /* backref */ }; @@ -724,14 +729,15 @@ static int disable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) "Suspend listening sockets: idlers:%i conns:%u backlog:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u " - "timers:%u suspended:%u", + "waitio:%u write:%u keepalive:%u shutdown:%u " + "linger:%u/%u timers:%u suspended:%u", ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(backlog_q->total), apr_atomic_read32(waitio_q->total), apr_atomic_read32(write_completion_q->total), apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(shutdown_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&timers_count), @@ -756,14 +762,15 @@ static int enable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) "Resume listening sockets: idlers:%i conns:%u backlog:%u " - "waitio:%u write:%u keepalive:%u linger:%u/%u " - "timers:%u suspended:%u", + "waitio:%u write:%u keepalive:%u shutdown:%u " + "linger:%u/%u timers:%u suspended:%u", ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(backlog_q->total), apr_atomic_read32(waitio_q->total), apr_atomic_read32(write_completion_q->total), apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(shutdown_q->total), apr_atomic_read32(linger_q->total), apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&timers_count), @@ -1917,7 +1924,7 @@ static void push2worker(event_conn_state_t *cs, timer_event_t *te, ap_assert(!cs_in_backlog(cs)); ap_assert(!cs->q); - if (busy && cs->pub.state == CONN_STATE_LINGER && cs->linger_started) { + if (busy && cs->pub.state == CONN_STATE_LINGER && cs->linger_shutdown) { /* Not worth lingering more on this connection if we are short of * workers and everything is flushed+shutdown already, back out * and close. @@ -2201,19 +2208,53 @@ static void process_lingering_close(event_conn_state_t *cs) AP_DEBUG_ASSERT(CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)); /* Flush and shutdown first */ - if (!cs->linger_started) { - cs->linger_started = 1; /* once! */ - apr_atomic_inc32(&lingering_count); + if (!cs->linger_shutdown) { + conn_rec *c = cs->c; + int rc = OK; + cs->pub.state = CONN_STATE_LINGER; - apr_socket_timeout_set(csd, apr_time_from_sec(SECONDS_TO_LINGER)); - if (ap_start_lingering_close(cs->c)) { + if (!cs->linger_started) { + cs->linger_started = 1; /* once! */ + apr_atomic_inc32(&lingering_count); notify_suspend(cs); + + /* Shutdown the connection, i.e. 
pre_connection_close hooks, + * SSL/TLS close notify, WC bucket, etc.. + */ + rc = ap_prep_lingering_close(c); + if (rc == OK) { + rc = ap_shutdown_conn(c, AP_SHUTDOWN_CONN_WC); + if (rc == OK) { + if (c->aborted) { + rc = DONE; + } + else if (ap_filter_should_yield(c->output_filters)) { + rc = AGAIN; + } + } + } + } + else { + rc = ap_check_output_pending(c); + } + + cs->pub.state = CONN_STATE_LINGER; + cs->pub.sense = CONN_SENSE_DEFAULT; + if (rc == AGAIN) { + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "queuing lingering close for connection %" CS_FMT, + CS_ARG(cs)); + if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->sh_q, NULL)) { + return; /* queued */ + } + } + if (rc != OK || apr_socket_shutdown(csd, APR_SHUTDOWN_WRITE)) { close_connection(cs); return; } - - notify_suspend(cs); + + cs->linger_shutdown = 1; /* once! */ /* All nonblocking from now, no need for APR_INCOMPLETE_READ either */ apr_socket_timeout_set(csd, 0); @@ -2230,7 +2271,6 @@ static void process_lingering_close(event_conn_state_t *cs) else { cs->pub.state = CONN_STATE_LINGER_NORMAL; } - cs->pub.sense = CONN_SENSE_DEFAULT; } /* Drain until EAGAIN or EOF/error, in the former case requeue and @@ -2729,32 +2769,30 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* Recompute this by walking the timeout queues (under the lock) */ queues_next_expiry = 0; - /* Step 1: keepalive queue timeouts */ + /* Process shutdown_q first because the expired entries from the + * other queues will go there and don't need to be checked twice + * (nor do we want to potentially kill them before the shutdown). + */ + process_timeout_queue(shutdown_q, now); + + process_timeout_queue(waitio_q, now); + process_timeout_queue(write_completion_q, now); + + /* The linger and keepalive queues can be shrinked any time + * under pressure. + */ if (workers_were_busy || dying) { + shrink_timeout_queue(linger_q, now); + shrink_timeout_queue(short_linger_q, now); shrink_timeout_queue(keepalive_q, now); } else { + process_timeout_queue(linger_q, now); + process_timeout_queue(short_linger_q, now); process_timeout_queue(keepalive_q, now); } - /* Step 2: waitio queue timeouts */ - process_timeout_queue(waitio_q, now); - - /* Step 3: write completion queue timeouts */ - process_timeout_queue(write_completion_q, now); - - /* Step 4: normal lingering close queue timeouts */ - if (dying && linger_q->timeout > short_linger_q->timeout) { - /* Dying, force short timeout for normal lingering close */ - linger_q->timeout = short_linger_q->timeout; - } - process_timeout_queue(linger_q, now); - - /* Step 5: short lingering close queue timeouts */ - process_timeout_queue(short_linger_q, now); - - /* Step 6: backlog queue timeouts - * Connections in backlog race with the workers (dequeuing) under + /* Connections in backlog race with the workers (dequeuing) under * the worker_queue mutex. 
*/ if (apr_atomic_read32(backlog_q->total)) { @@ -2773,14 +2811,19 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->wait_io = apr_atomic_read32(waitio_q->total); ps->write_completion = apr_atomic_read32(write_completion_q->total); ps->keep_alive = apr_atomic_read32(keepalive_q->total); + ps->shutdown = apr_atomic_read32(shutdown_q->total); ps->lingering_close = apr_atomic_read32(&lingering_count); ps->backlog = apr_atomic_read32(backlog_q->total); ps->suspended = apr_atomic_read32(&suspended_count); ps->connections = apr_atomic_read32(&connection_count); } else if ((workers_were_busy || dying) - && apr_atomic_read32(keepalive_q->total)) { + && (apr_atomic_read32(linger_q->total) + || apr_atomic_read32(short_linger_q->total) + || apr_atomic_read32(keepalive_q->total))) { apr_thread_mutex_lock(timeout_mutex); + shrink_timeout_queue(linger_q, now); + shrink_timeout_queue(short_linger_q, now); shrink_timeout_queue(keepalive_q, now); apr_thread_mutex_unlock(timeout_mutex); ps->keep_alive = 0; @@ -4444,7 +4487,7 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, apr_pool_t *ptemp, server_rec *s) { - apr_hash_t *io_h, *wc_h, *ka_h, *bl_h; + apr_hash_t *io_h, *wc_h, *ka_h, *sh_h, *bl_h; /* Not needed in pre_config stage */ if (ap_state_query(AP_SQ_MAIN_STATE) == AP_SQ_MS_CREATE_PRE_CONFIG) { @@ -4454,6 +4497,7 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, io_h = apr_hash_make(ptemp); wc_h = apr_hash_make(ptemp); ka_h = apr_hash_make(ptemp); + sh_h = apr_hash_make(ptemp); bl_h = apr_hash_make(ptemp); linger_q = TO_QUEUE_MAKE(pconf, "linger", @@ -4475,8 +4519,11 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, sc->ka_q = TO_QUEUE_CHAIN(pconf, "keepalive", s->keep_alive_timeout, &keepalive_q, ka_h, ptemp); + sc->sh_q = TO_QUEUE_CHAIN(pconf, "shutdown", s->timeout, + &shutdown_q, sh_h, ptemp); + sc->bl_q = TO_QUEUE_CHAIN(pconf, "backlog", s->timeout, - &backlog_q, bl_h, ptemp); + &backlog_q, bl_h, ptemp); } return OK; From 364a3894b3b6c80211d615c6722f7607c3fe9d82 Mon Sep 17 00:00:00 2001 From: ylavic Date: Wed, 10 Jul 2024 15:08:28 +0200 Subject: [PATCH 12/22] mpm_event: Don't shrink keepalive queue when busy/exiting. --- server/mpm/event/event.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 8c5bee23115..f341f1daf87 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -1669,7 +1669,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, goto process_connection; } } - if (pending != OK || listener_may_exit) { + if (pending != OK) { cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } @@ -2280,7 +2280,7 @@ static void process_lingering_close(event_conn_state_t *cs) apr_size_t nbytes = sizeof(dummybuf); rv = apr_socket_recv(csd, dummybuf, &nbytes); } while (rv == APR_SUCCESS); - if (APR_STATUS_IS_EAGAIN(rv)) { + if (APR_STATUS_IS_EAGAIN(rv) && !listensocks_disabled()) { struct timeout_queue *q; q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? 
short_linger_q : linger_q; if (pollset_add(cs, CONN_SENSE_WANT_READ, q, NULL)) { @@ -2777,19 +2777,16 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) process_timeout_queue(waitio_q, now); process_timeout_queue(write_completion_q, now); + process_timeout_queue(keepalive_q, now); - /* The linger and keepalive queues can be shrinked any time - * under pressure. - */ + /* The linger queues can be shrinked any time under pressure */ if (workers_were_busy || dying) { shrink_timeout_queue(linger_q, now); shrink_timeout_queue(short_linger_q, now); - shrink_timeout_queue(keepalive_q, now); } else { process_timeout_queue(linger_q, now); process_timeout_queue(short_linger_q, now); - process_timeout_queue(keepalive_q, now); } /* Connections in backlog race with the workers (dequeuing) under @@ -2819,14 +2816,11 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } else if ((workers_were_busy || dying) && (apr_atomic_read32(linger_q->total) - || apr_atomic_read32(short_linger_q->total) - || apr_atomic_read32(keepalive_q->total))) { + || apr_atomic_read32(short_linger_q->total))) { apr_thread_mutex_lock(timeout_mutex); shrink_timeout_queue(linger_q, now); shrink_timeout_queue(short_linger_q, now); - shrink_timeout_queue(keepalive_q, now); apr_thread_mutex_unlock(timeout_mutex); - ps->keep_alive = 0; } } /* listener main loop */ From eb1eb7fb894dab129efd0f1181f3e2fd1a95ef74 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 26 Jun 2023 19:26:58 +0200 Subject: [PATCH 13/22] mpm_event: Single linger queue/timeout (short one, 2s). --- server/mpm/event/event.c | 128 +++++++++++++-------------------------- 1 file changed, 41 insertions(+), 87 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index f341f1daf87..0058ba20994 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -155,12 +155,8 @@ #define apr_time_from_msec(x) ((x) * 1000) #endif -#define CONN_STATE_IS_LINGERING_CLOSE(s) ((s) >= CONN_STATE_LINGER && \ - (s) <= CONN_STATE_LINGER_SHORT) -#ifndef MAX_SECS_TO_LINGER -#define MAX_SECS_TO_LINGER 30 -#endif -#define SECONDS_TO_LINGER 2 +/* Lingering close (read) timeout */ +#define LINGER_READ_TIMEOUT apr_time_from_sec(2) /* Don't wait more time in poll() if APR_POLLSET_WAKEABLE is not implemented */ #define NON_WAKEABLE_TIMEOUT apr_time_from_msec(100) @@ -204,7 +200,6 @@ static volatile int start_thread_may_exit = 0; static volatile int listener_may_exit = 0; static apr_uint32_t connection_count = 0; /* Number of open connections */ static apr_uint32_t timers_count = 0; /* Number of queued timers */ -static apr_uint32_t lingering_count = 0; /* Number of connections in lingering close */ static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown early during graceful termination */ @@ -458,8 +453,7 @@ struct timeout_queue { * write_completion_q uses vhost's TimeOut * keepalive_q uses vhost's KeepAliveTimeOut * shutdown_q uses vhost's TimeOut - * linger_q uses MAX_SECS_TO_LINGER - * short_linger_q uses SECONDS_TO_LINGER + * linger_q uses LINGER_READ_TIMEOUT * backlog_q uses vhost's TimeOut */ static struct timeout_queue *waitio_q, /* wait for I/O to happen */ @@ -467,7 +461,6 @@ static struct timeout_queue *waitio_q, /* wait for I/O to happen */ *keepalive_q, /* in between requests */ *shutdown_q, /* shutting down (write) before close */ *linger_q, /* lingering (read) before close */ - 
*short_linger_q, /* lingering (read) before close (short timeout) */ *backlog_q; /* waiting for a worker */ static volatile apr_time_t queues_next_expiry; /* next expiry time accross all queues */ @@ -730,7 +723,7 @@ static int disable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(10381) "Suspend listening sockets: idlers:%i conns:%u backlog:%u " "waitio:%u write:%u keepalive:%u shutdown:%u " - "linger:%u/%u timers:%u suspended:%u", + "linger:%u timers:%u suspended:%u", ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(backlog_q->total), @@ -739,7 +732,6 @@ static int disable_listensocks(void) apr_atomic_read32(keepalive_q->total), apr_atomic_read32(shutdown_q->total), apr_atomic_read32(linger_q->total), - apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); @@ -763,7 +755,7 @@ static int enable_listensocks(void) ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, APLOGNO(00457) "Resume listening sockets: idlers:%i conns:%u backlog:%u " "waitio:%u write:%u keepalive:%u shutdown:%u " - "linger:%u/%u timers:%u suspended:%u", + "linger:%u timers:%u suspended:%u", ap_queue_info_idlers_count(worker_queue_info), apr_atomic_read32(&connection_count), apr_atomic_read32(backlog_q->total), @@ -772,7 +764,6 @@ static int enable_listensocks(void) apr_atomic_read32(keepalive_q->total), apr_atomic_read32(shutdown_q->total), apr_atomic_read32(linger_q->total), - apr_atomic_read32(short_linger_q->total), apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); @@ -798,7 +789,7 @@ static APR_INLINE int connections_above_limit(int *busy) apr_int32_t i_count = ap_queue_info_idlers_count(worker_queue_info); if (i_count > 0) { apr_uint32_t c_count = apr_atomic_read32(&connection_count); - apr_uint32_t l_count = apr_atomic_read32(&lingering_count); + apr_uint32_t l_count = apr_atomic_read32(linger_q->total); if (c_count <= l_count /* Off by 'listensocks_disabled()' to avoid flip flop */ || c_count - l_count < (apr_uint32_t)threads_per_child + @@ -1092,17 +1083,12 @@ static apr_status_t decrement_connection_count(void *cs_) CS_ARG_TO(cs)); switch (cs->pub.state) { - case CONN_STATE_LINGER: - case CONN_STATE_LINGER_NORMAL: - case CONN_STATE_LINGER_SHORT: - apr_atomic_dec32(&lingering_count); - break; - case CONN_STATE_SUSPENDED: - apr_atomic_dec32(&suspended_count); - break; - default: - break; + case CONN_STATE_SUSPENDED: + apr_atomic_dec32(&suspended_count); + default: + break; } + /* Unblock the listener if it's waiting for connection_count = 0, * or if the listening sockets were disabled due to limits and can * now accept new connections. @@ -1185,7 +1171,7 @@ static void push2worker(event_conn_state_t *cs, timer_event_t *te, apr_time_t now, int *busy); /* Shutdown the connection in case of timeout, error or resources shortage. - * This starts short lingering close if not already there, or directly closes + * This starts lingering close if not already there, or directly closes * the connection otherwise. * Pre-condition: nonblocking, can be called from anywhere provided cs is not * in the pollset nor any non-backlog timeout queue. 
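
For reference, the drain step that the following hunks converge on, now with a
single short lingering timeout, boils down to this sketch (illustrative only:
requeue_for_read() and close_now() stand in for the MPM's pollset_add() and
close_connection() plumbing):

    /* Drain until EAGAIN (peer may still be sending, so poll again with the
     * single 2s LINGER_READ_TIMEOUT) or until EOF/error (close right away).
     */
    char dummybuf[512];
    apr_size_t nbytes;
    apr_status_t rv;
    do {
        nbytes = sizeof(dummybuf);
        rv = apr_socket_recv(csd, dummybuf, &nbytes);
    } while (rv == APR_SUCCESS);
    if (APR_STATUS_IS_EAGAIN(rv) && !listensocks_disabled()) {
        requeue_for_read(cs);   /* hypothetical: pollset_add(..., linger_q, ...) */
    }
    else {
        close_now(cs);          /* hypothetical: close_connection(cs) */
    }
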
@@ -1199,8 +1185,6 @@ static void shutdown_connection(event_conn_state_t *cs, apr_time_t now, int log_level = APLOG_INFO; switch (cs->pub.state) { case CONN_STATE_LINGER: - case CONN_STATE_LINGER_NORMAL: - case CONN_STATE_LINGER_SHORT: case CONN_STATE_KEEPALIVE: log_level = APLOG_TRACE2; default: @@ -1214,8 +1198,7 @@ static void shutdown_connection(event_conn_state_t *cs, apr_time_t now, /* Don't re-schedule connections in lingering close, they had * their chance already so just close them now. */ - if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + if (cs->pub.state != CONN_STATE_LINGER) { cs->pub.state = CONN_STATE_LINGER; push2worker(cs, NULL, now, NULL); } @@ -1530,7 +1513,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, "processing connection %" CS_FMT " (aborted %d, clogging %d)", CS_ARG(cs), c->aborted, c->clogging_input_filters); - if (CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { + if (cs->pub.state == CONN_STATE_LINGER) { goto lingering_close; } @@ -1628,7 +1611,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, q = cs->sc->io_q; } if (!pollset_add(cs, CONN_SENSE_WANT_READ, q, te)) { - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } @@ -1658,7 +1640,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, return; /* queued */ } /* Fall through lingering close */ - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); } else if (pending == OK) { /* Some data to process immediately? */ @@ -1692,7 +1673,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p, notify_suspend(cs); if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q, NULL)) { - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); cs->pub.state = CONN_STATE_LINGER; goto lingering_close; } @@ -1730,16 +1710,15 @@ static apr_status_t event_resume_suspended (conn_rec *c) c->suspended_baton = NULL; cs->pub.sense = CONN_SENSE_DEFAULT; - if (!CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)) { + if (cs->pub.state != CONN_STATE_LINGER) { cs->pub.state = CONN_STATE_WRITE_COMPLETION; if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) { return APR_SUCCESS; /* queued */ } /* fall through lingering close on error */ - apr_table_setn(cs->c->notes, "short-lingering-close", "1"); + cs->pub.state = CONN_STATE_LINGER; } - cs->pub.state = CONN_STATE_LINGER; process_lingering_close(cs); return APR_SUCCESS; } @@ -2205,7 +2184,7 @@ static void process_lingering_close(event_conn_state_t *cs) ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, "lingering close for connection %" CS_FMT, CS_ARG(cs)); - AP_DEBUG_ASSERT(CONN_STATE_IS_LINGERING_CLOSE(cs->pub.state)); + AP_DEBUG_ASSERT(cs->pub.state == CONN_STATE_LINGER); /* Flush and shutdown first */ if (!cs->linger_shutdown) { @@ -2216,7 +2195,6 @@ static void process_lingering_close(event_conn_state_t *cs) if (!cs->linger_started) { cs->linger_started = 1; /* once! */ - apr_atomic_inc32(&lingering_count); notify_suspend(cs); /* Shutdown the connection, i.e. pre_connection_close hooks, @@ -2259,18 +2237,6 @@ static void process_lingering_close(event_conn_state_t *cs) /* All nonblocking from now, no need for APR_INCOMPLETE_READ either */ apr_socket_timeout_set(csd, 0); apr_socket_opt_set(csd, APR_INCOMPLETE_READ, 0); - - /* - * If some module requested a shortened waiting period, only wait for - * 2s (SECONDS_TO_LINGER). This is useful for mitigating certain - * DoS attacks. 
- */ - if (apr_table_get(cs->c->notes, "short-lingering-close")) { - cs->pub.state = CONN_STATE_LINGER_SHORT; - } - else { - cs->pub.state = CONN_STATE_LINGER_NORMAL; - } } /* Drain until EAGAIN or EOF/error, in the former case requeue and @@ -2280,14 +2246,12 @@ static void process_lingering_close(event_conn_state_t *cs) apr_size_t nbytes = sizeof(dummybuf); rv = apr_socket_recv(csd, dummybuf, &nbytes); } while (rv == APR_SUCCESS); - if (APR_STATUS_IS_EAGAIN(rv) && !listensocks_disabled()) { - struct timeout_queue *q; - q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; - if (pollset_add(cs, CONN_SENSE_WANT_READ, q, NULL)) { - return; /* queued */ - } + + if (!APR_STATUS_IS_EAGAIN(rv) + || listensocks_disabled() /* busy enough */ + || !pollset_add(cs, CONN_SENSE_WANT_READ, linger_q, NULL)) { + close_connection(cs); } - close_connection(cs); } /* Call shutdown_connection() for the elements of 'q' that timed out, or @@ -2437,22 +2401,20 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } } - if (APLOGtrace6(ap_server_conf)) { - /* trace log status every second */ - if (now - last_log > apr_time_from_sec(1)) { - ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, - "connections: %u (waitio:%u write:%u keepalive:%u " - "lingering:%u suspended:%u), workers: %u/%u shutdown", - apr_atomic_read32(&connection_count), - apr_atomic_read32(waitio_q->total), - apr_atomic_read32(write_completion_q->total), - apr_atomic_read32(keepalive_q->total), - apr_atomic_read32(&lingering_count), - apr_atomic_read32(&suspended_count), - apr_atomic_read32(&threads_shutdown), - threads_per_child); - last_log = now; - } + /* trace log status every second */ + if (APLOGtrace6(ap_server_conf) && now - last_log > apr_time_from_sec(1)) { + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, + "connections: %u (waitio:%d write:%d keepalive:%d " + "lingering:%d suspended:%u), workers: %u/%u shutdown", + apr_atomic_read32(&connection_count), + apr_atomic_read32(waitio_q->total), + apr_atomic_read32(write_completion_q->total), + apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(linger_q->total), + apr_atomic_read32(&suspended_count), + apr_atomic_read32(&threads_shutdown), + threads_per_child); + last_log = now; } #if HAVE_SERF @@ -2608,8 +2570,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) case CONN_STATE_ASYNC_WAITIO: cs->pub.state = CONN_STATE_PROCESSING; case CONN_STATE_WRITE_COMPLETION: - case CONN_STATE_LINGER_NORMAL: - case CONN_STATE_LINGER_SHORT: + case CONN_STATE_LINGER: break; default: @@ -2779,14 +2740,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) process_timeout_queue(write_completion_q, now); process_timeout_queue(keepalive_q, now); - /* The linger queues can be shrinked any time under pressure */ + /* The linger queue can be shrinked any time under pressure */ if (workers_were_busy || dying) { shrink_timeout_queue(linger_q, now); - shrink_timeout_queue(short_linger_q, now); } else { process_timeout_queue(linger_q, now); - process_timeout_queue(short_linger_q, now); } /* Connections in backlog race with the workers (dequeuing) under @@ -2809,17 +2768,15 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->write_completion = apr_atomic_read32(write_completion_q->total); ps->keep_alive = apr_atomic_read32(keepalive_q->total); ps->shutdown = apr_atomic_read32(shutdown_q->total); - ps->lingering_close = 
apr_atomic_read32(&lingering_count); + ps->lingering_close = apr_atomic_read32(linger_q->total); ps->backlog = apr_atomic_read32(backlog_q->total); ps->suspended = apr_atomic_read32(&suspended_count); ps->connections = apr_atomic_read32(&connection_count); } else if ((workers_were_busy || dying) - && (apr_atomic_read32(linger_q->total) - || apr_atomic_read32(short_linger_q->total))) { + && apr_atomic_read32(linger_q->total)) { apr_thread_mutex_lock(timeout_mutex); shrink_timeout_queue(linger_q, now); - shrink_timeout_queue(short_linger_q, now); apr_thread_mutex_unlock(timeout_mutex); } } /* listener main loop */ @@ -4494,10 +4451,7 @@ static int event_post_config(apr_pool_t *pconf, apr_pool_t *plog, sh_h = apr_hash_make(ptemp); bl_h = apr_hash_make(ptemp); - linger_q = TO_QUEUE_MAKE(pconf, "linger", - apr_time_from_sec(MAX_SECS_TO_LINGER), NULL); - short_linger_q = TO_QUEUE_MAKE(pconf, "short_linger", - apr_time_from_sec(SECONDS_TO_LINGER), NULL); + linger_q = TO_QUEUE_MAKE(pconf, "linger", LINGER_READ_TIMEOUT, NULL); for (; s; s = s->next) { event_srv_cfg *sc = apr_pcalloc(pconf, sizeof *sc); From c82d67ad99dd9f725135a8a0e5d8bf87b9a9b2d8 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 26 Jun 2023 21:55:25 +0200 Subject: [PATCH 14/22] mpm_event: Periodic linger queue shrink (500ms). --- server/mpm/event/event.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 0058ba20994..2d33613c41f 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -158,6 +158,9 @@ /* Lingering close (read) timeout */ #define LINGER_READ_TIMEOUT apr_time_from_sec(2) +/* Shrink linger_q at this period (min) when busy */ +#define QUEUES_SHRINK_TIMEOUT apr_time_from_msec(500) + /* Don't wait more time in poll() if APR_POLLSET_WAKEABLE is not implemented */ #define NON_WAKEABLE_TIMEOUT apr_time_from_msec(100) @@ -2348,7 +2351,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) proc_info *ti = dummy; int process_slot = ti->pslot; process_score *ps = ap_get_scoreboard_process(process_slot); - apr_time_t last_log; + apr_time_t last_log, next_shrink_time = 0; last_log = event_time_now(); free(ti); @@ -2743,6 +2746,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* The linger queue can be shrinked any time under pressure */ if (workers_were_busy || dying) { shrink_timeout_queue(linger_q, now); + next_shrink_time = now + QUEUES_SHRINK_TIMEOUT; } else { process_timeout_queue(linger_q, now); @@ -2773,11 +2777,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->suspended = apr_atomic_read32(&suspended_count); ps->connections = apr_atomic_read32(&connection_count); } - else if ((workers_were_busy || dying) + else if (next_shrink_time <= now + && (workers_were_busy || dying) && apr_atomic_read32(linger_q->total)) { apr_thread_mutex_lock(timeout_mutex); shrink_timeout_queue(linger_q, now); apr_thread_mutex_unlock(timeout_mutex); + next_shrink_time = now + QUEUES_SHRINK_TIMEOUT; } } /* listener main loop */ From eddf29957cfe27fb6243a16767d1508fa1295254 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 26 Jun 2023 20:05:33 +0200 Subject: [PATCH 15/22] mpm_event: Use atomic reads/writes for shared resources. 
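
The pattern, sketched: counters like the queues' totals are only ever written
under timeout_mutex, so the writer side keeps a plain serialized update (the
cheaper apr_atomic_set32() rather than a locked read-modify-write with
apr_atomic_inc32()), while readers take lock-free snapshots:

    apr_thread_mutex_lock(timeout_mutex);
    apr_atomic_set32(q->total, *q->total + 1);  /* writers serialized by lock */
    apr_thread_mutex_unlock(timeout_mutex);

    /* elsewhere, without the lock (stats, logs, heuristics) */
    apr_uint32_t n = apr_atomic_read32(q->total);
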
--- server/mpm/event/event.c | 123 ++++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 48 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 2d33613c41f..37e6f1b63fd 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -197,15 +197,16 @@ static int server_limit = 0; /* ServerLimit */ static int thread_limit = 0; /* ThreadLimit */ static int conns_this_child = 0; /* MaxConnectionsPerChild, only accessed in listener thread */ -static volatile int dying = 0; -static volatile int workers_may_exit = 0; -static volatile int start_thread_may_exit = 0; -static volatile int listener_may_exit = 0; -static apr_uint32_t connection_count = 0; /* Number of open connections */ -static apr_uint32_t timers_count = 0; /* Number of queued timers */ -static apr_uint32_t suspended_count = 0; /* Number of suspended connections */ -static apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown - early during graceful termination */ +static /*atomic*/ apr_uint32_t dying = 0; +static /*atomic*/ apr_uint32_t workers_may_exit = 0; +static /*atomic*/ apr_uint32_t start_thread_may_exit = 0; +static /*atomic*/ apr_uint32_t listener_may_exit = 0; +static /*atomic*/ apr_uint32_t connection_count = 0; /* Number of open connections */ +static /*atomic*/ apr_uint32_t timers_count = 0; /* Number of queued timers */ +static /*atomic*/ apr_uint32_t suspended_count = 0; /* Number of suspended connections */ +static /*atomic*/ apr_uint32_t threads_shutdown = 0; /* Number of threads that have shutdown + early during graceful termination */ + static int had_healthy_child = 0; static int resource_shortage = 0; @@ -481,9 +482,14 @@ static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *cs) cs->q = q; cs->queue_timestamp = event_time_now(); APR_RING_INSERT_TAIL(&q->head, cs, event_conn_state_t, timeout_list); - ++*q->total; ++q->count; + /* Use atomic_set to be ordered/consistent with potential atomic reads + * outside the critical section, but writes are protected so a more + * expensive atomic_inc is not needed. + */ + apr_atomic_set32(q->total, *q->total + 1); + /* Cheaply update the global queues_next_expiry with the one of the * first entry of this queue (oldest) if it expires before. */ @@ -506,8 +512,13 @@ static void TO_QUEUE_REMOVE(struct timeout_queue *q, event_conn_state_t *cs) APR_RING_REMOVE(cs, timeout_list); APR_RING_ELEM_INIT(cs, timeout_list); - --*q->total; --q->count; + + /* Use atomic_set to be ordered/consistent with potential atomic reads + * outside the critical section, but writes are protected so a more + * expensive atomic_dec is not needed. 
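+     * A lock-free reader may still briefly observe the previous total,
+     * which is harmless for the stats, logs and heuristics reading it.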
+ */ + apr_atomic_set32(q->total, *q->total - 1); } static struct timeout_queue *TO_QUEUE_MAKE(apr_pool_t *p, @@ -717,6 +728,7 @@ static /*atomic*/ apr_uint32_t listensocks_off = 0; static int disable_listensocks(void) { + volatile process_score *ps; int i; if (apr_atomic_cas32(&listensocks_off, 1, 0) != 0) { @@ -738,7 +750,8 @@ static int disable_listensocks(void) apr_atomic_read32(&timers_count), apr_atomic_read32(&suspended_count)); - ap_scoreboard_image->parent[ap_child_slot].not_accepting = 1; + ps = &ap_scoreboard_image->parent[ap_child_slot]; + ps->not_accepting = 1; for (i = 0; i < num_listensocks; i++) { apr_pollset_remove(event_pollset, &listener_pollfd[i]); @@ -748,9 +761,10 @@ static int disable_listensocks(void) static int enable_listensocks(void) { + volatile process_score *ps; int i; - if (listener_may_exit + if (apr_atomic_read32(&dying) || apr_atomic_cas32(&listensocks_off, 0, 1) != 1) { return 0; } @@ -774,7 +788,8 @@ static int enable_listensocks(void) * XXX: This is not yet optimal. If many workers suddenly become available, * XXX: the parent may kill some processes off too soon. */ - ap_scoreboard_image->parent[ap_child_slot].not_accepting = 0; + ps = &ap_scoreboard_image->parent[ap_child_slot]; + ps->not_accepting = 0; for (i = 0; i < num_listensocks; i++) { apr_pollset_add(event_pollset, &listener_pollfd[i]); @@ -809,7 +824,9 @@ static APR_INLINE int connections_above_limit(int *busy) static APR_INLINE int should_enable_listensocks(void) { - return !dying && listensocks_disabled() && !connections_above_limit(NULL); + return (listensocks_disabled() + && !apr_atomic_read32(&dying) + && !connections_above_limit(NULL)); } static void close_socket_at(apr_socket_t *csd, @@ -855,10 +872,9 @@ static void shutdown_listener(void) { ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, "shutting down listener%s", - listener_may_exit ? " again" : ""); + apr_atomic_read32(&listener_may_exit) ? " again" : ""); - listener_may_exit = 1; - disable_listensocks(); + apr_atomic_set32(&listener_may_exit, 1); /* Unblock the listener if it's poll()ing */ if (event_pollset && listener_is_wakeable) { @@ -914,7 +930,7 @@ static void signal_threads(int mode) * workers to exit once it has stopped accepting new connections */ if (mode == ST_UNGRACEFUL) { - workers_may_exit = 1; + apr_atomic_set32(&workers_may_exit, 1); ap_queue_interrupt_all(worker_queue); close_worker_sockets(); /* forcefully kill all current connections */ } @@ -993,7 +1009,7 @@ static int event_query(int query_code, int *result, apr_status_t *rv) static void event_note_child_stopped(int slot, pid_t pid, ap_generation_t gen) { if (slot != -1) { /* child had a scoreboard slot? */ - process_score *ps = &ap_scoreboard_image->parent[slot]; + volatile process_score *ps = &ap_scoreboard_image->parent[slot]; int i; pid = ps->pid; @@ -1079,8 +1095,9 @@ static int child_fatal; static apr_status_t decrement_connection_count(void *cs_) { - int is_last_connection; event_conn_state_t *cs = cs_; + int is_last_connection, is_dying; + ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, "connection %" CS_FMT_TO " cleaned up", CS_ARG_TO(cs)); @@ -1097,12 +1114,13 @@ static apr_status_t decrement_connection_count(void *cs_) * now accept new connections. 
*/ is_last_connection = !apr_atomic_dec32(&connection_count); + is_dying = apr_atomic_read32(&dying); if (listener_is_wakeable - && ((is_last_connection && listener_may_exit) + && ((is_last_connection && is_dying) || should_enable_listensocks())) { apr_pollset_wakeup(event_pollset); } - if (dying) { + if (is_dying) { /* Help worker_thread_should_exit_early() */ ap_queue_interrupt_one(worker_queue); } @@ -1325,7 +1343,7 @@ static int pollset_add_at(event_conn_state_t *cs, int sense, } /* close_worker_sockets() may have closed it already */ - if (workers_may_exit) { + if (apr_atomic_read32(&workers_may_exit)) { AP_DEBUG_ASSERT(APR_STATUS_IS_EBADF(rv)); } else { @@ -1742,10 +1760,14 @@ static void check_infinite_requests(void) static void set_child_dying(void) { + volatile process_score *ps; + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, "quiescing"); + ps = &ap_scoreboard_image->parent[ap_child_slot]; + ps->quiescing = 1; - dying = 1; - ap_scoreboard_image->parent[ap_child_slot].quiescing = 1; + apr_atomic_set32(&dying, 1); + disable_listensocks(); /* definitively with dying = 1 */ ap_close_listeners_ex(my_bucket->listeners); #if 0 @@ -2340,7 +2362,7 @@ static APR_INLINE void shrink_timeout_queue(struct timeout_queue *queue, if (count) { ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, "All workers are %s, %s queue shrinked (%u done, %u left)", - dying ? "dying" : "busy", queue->name, + apr_atomic_read32(&dying) ? "dying" : "busy", queue->name, count, apr_atomic_read32(queue->total)); } } @@ -2384,8 +2406,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) now = poll_time = event_time_now(); - if (listener_may_exit) { - int once = !dying; + if (apr_atomic_read32(&listener_may_exit)) { + int once = !apr_atomic_read32(&dying); if (once) { set_child_dying(); } @@ -2519,7 +2541,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) timers_next_expiry ? timers_next_expiry - now : 0, listensocks_disabled() ? "no" : "yes", apr_atomic_read32(&connection_count), - listener_may_exit, dying); + apr_atomic_read32(&listener_may_exit), + apr_atomic_read32(&dying)); rc = apr_pollset_poll(event_pollset, timeout, &num, &out_pfd); if (rc != APR_SUCCESS) { @@ -2554,7 +2577,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) timers_next_expiry ? timers_next_expiry - now : 0, listensocks_disabled() ? 
"no" : "yes", apr_atomic_read32(&connection_count), - listener_may_exit, dying); + apr_atomic_read32(&listener_may_exit), + apr_atomic_read32(&dying)); for (user_chain = NULL; num > 0; --num, ++out_pfd) { listener_poll_type *pt = out_pfd->client_data; @@ -2601,7 +2625,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) */ continue; } - if (!dying) { + if (!apr_atomic_read32(&dying)) { void *csd = NULL; ap_listen_rec *lr = (ap_listen_rec *) pt->baton; apr_pool_t *ptrans; /* Pool for per-transaction stuff */ @@ -2744,7 +2768,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) process_timeout_queue(keepalive_q, now); /* The linger queue can be shrinked any time under pressure */ - if (workers_were_busy || dying) { + if (workers_were_busy || apr_atomic_read32(&dying)) { shrink_timeout_queue(linger_q, now); next_shrink_time = now + QUEUES_SHRINK_TIMEOUT; } @@ -2778,7 +2802,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ps->connections = apr_atomic_read32(&connection_count); } else if (next_shrink_time <= now - && (workers_were_busy || dying) + && (workers_were_busy || apr_atomic_read32(&dying)) && apr_atomic_read32(linger_q->total)) { apr_thread_mutex_lock(timeout_mutex); shrink_timeout_queue(linger_q, now); @@ -2870,17 +2894,18 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) } ap_update_child_status_from_indexes(process_slot, thread_slot, - dying ? SERVER_GRACEFUL - : SERVER_READY, + (apr_atomic_read32(&dying) + ? SERVER_GRACEFUL : SERVER_READY), NULL); - if (workers_may_exit) { + if (apr_atomic_read32(&workers_may_exit)) { ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf, "worker thread %i/%i may exit", thread_slot, threads_per_child); break; } - if (dying && worker_thread_should_exit_early(thread_slot)) { + if (apr_atomic_read32(&dying) + && worker_thread_should_exit_early(thread_slot)) { break; } @@ -2907,7 +2932,7 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) * may have already been cleaned up. Don't log the "error" if * workers_may_exit is set. */ - if (!APR_STATUS_IS_EINTR(rv) && !workers_may_exit) { + if (!APR_STATUS_IS_EINTR(rv) && !apr_atomic_read32(&workers_may_exit)) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, ap_server_conf, APLOGNO(03099) "ap_queue_pop_event failed"); AP_DEBUG_ASSERT(0); @@ -2966,8 +2991,8 @@ static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy) } ap_update_child_status_from_indexes(process_slot, thread_slot, - dying ? SERVER_DEAD - : SERVER_GRACEFUL, + (apr_atomic_read32(&dying) + ? 
SERVER_DEAD : SERVER_GRACEFUL), NULL); apr_thread_exit(thd, APR_SUCCESS); @@ -3240,7 +3265,8 @@ static void *APR_THREAD_FUNC start_threads(apr_thread_t * thd, void *dummy) } - if (start_thread_may_exit || threads_created == threads_per_child) { + if (apr_atomic_read32(&start_thread_may_exit) + || threads_created == threads_per_child) { break; } /* wait for previous generation to clean up an entry */ @@ -3290,9 +3316,9 @@ static void join_workers(apr_thread_t * listener, apr_thread_t ** threads) */ iter = 0; - while (!dying) { + while (!apr_atomic_read32(&dying)) { apr_sleep(apr_time_from_msec(500)); - if (dying || ++iter > 10) { + if (apr_atomic_read32(&dying) || ++iter > 10) { break; } /* listener has not stopped accepting yet */ @@ -3332,10 +3358,11 @@ static void join_start_thread(apr_thread_t * start_thread_id) { apr_status_t rv, thread_rv; - start_thread_may_exit = 1; /* tell it to give up in case it is still - * trying to take over slots from a - * previous generation - */ + /* tell it to give up in case it is still trying to take over slots + * from a previous generation + */ + apr_atomic_set32(&start_thread_may_exit, 1); + rv = apr_thread_join(&thread_rv, start_thread_id); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, ap_server_conf, APLOGNO(00478) From 143a83e09ba4496cfe6e41e4eb003ce17a76ce2d Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 3 Jun 2024 16:47:50 +0200 Subject: [PATCH 16/22] mpm_event: Periodic scoreboard stats update (1s). --- server/mpm/event/event.c | 115 +++++++++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 36 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 37e6f1b63fd..7141c46ce87 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -161,6 +161,9 @@ /* Shrink linger_q at this period (min) when busy */ #define QUEUES_SHRINK_TIMEOUT apr_time_from_msec(500) +/* Update scoreboard stats at this period */ +#define STATS_UPDATE_TIMEOUT apr_time_from_msec(1000) + /* Don't wait more time in poll() if APR_POLLSET_WAKEABLE is not implemented */ #define NON_WAKEABLE_TIMEOUT apr_time_from_msec(100) @@ -2367,15 +2370,53 @@ static APR_INLINE void shrink_timeout_queue(struct timeout_queue *queue, } } +static void update_stats(process_score *ps, apr_time_t now, + apr_time_t *when, int force) +{ + int expired = (*when <= now); + + if (expired || force) { + apr_atomic_set32(&ps->wait_io, apr_atomic_read32(waitio_q->total)); + apr_atomic_set32(&ps->write_completion, apr_atomic_read32(write_completion_q->total)); + apr_atomic_set32(&ps->keep_alive, apr_atomic_read32(keepalive_q->total)); + apr_atomic_set32(&ps->shutdown, apr_atomic_read32(shutdown_q->total)); + apr_atomic_set32(&ps->lingering_close, apr_atomic_read32(linger_q->total)); + apr_atomic_set32(&ps->backlog, apr_atomic_read32(backlog_q->total)); + apr_atomic_set32(&ps->suspended, apr_atomic_read32(&suspended_count)); + apr_atomic_set32(&ps->connections, apr_atomic_read32(&connection_count)); + } + + if (expired) { + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, + "child: idlers:%i conns:%u backlog:%u " + "waitio:%u write:%u keepalive:%u shutdown:%u linger:%u " + "timers:%u suspended:%u (%u/%u workers shutdown)", + ap_queue_info_idlers_count(worker_queue_info), + apr_atomic_read32(&connection_count), + apr_atomic_read32(backlog_q->total), + apr_atomic_read32(waitio_q->total), + apr_atomic_read32(write_completion_q->total), + apr_atomic_read32(keepalive_q->total), + apr_atomic_read32(shutdown_q->total), + 
apr_atomic_read32(linger_q->total), + apr_atomic_read32(&timers_count), + apr_atomic_read32(&suspended_count), + apr_atomic_read32(&threads_shutdown), + threads_per_child); + + *when = now + STATS_UPDATE_TIMEOUT; + } +} + static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) { apr_status_t rc; proc_info *ti = dummy; int process_slot = ti->pslot; process_score *ps = ap_get_scoreboard_process(process_slot); - apr_time_t last_log, next_shrink_time = 0; + apr_time_t next_stats_time = 0, next_shrink_time = 0; + apr_interval_time_t min_poll_timeout = -1; - last_log = event_time_now(); free(ti); #if HAVE_SERF @@ -2388,11 +2429,21 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) apr_signal(LISTENER_SIGNAL, dummy_signal_handler); unblock_signal(LISTENER_SIGNAL); + /* Don't wait in poll() for more than NON_WAKEABLE_TIMEOUT if the pollset + * is not wakeable, and not more then the stats update period either. + */ + if (!listener_is_wakeable) { + min_poll_timeout = NON_WAKEABLE_TIMEOUT; + } + if (min_poll_timeout < 0 || min_poll_timeout > STATS_UPDATE_TIMEOUT) { + min_poll_timeout = STATS_UPDATE_TIMEOUT; + } + for (;;) { apr_int32_t num = 0; apr_time_t next_expiry = -1; apr_interval_time_t timeout = -1; - int workers_were_busy = 0; + int workers_were_busy = 0, force_stats = 0; socket_callback_baton_t *user_chain; const apr_pollfd_t *out_pfd; apr_time_t now, poll_time; @@ -2426,22 +2477,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) } } - /* trace log status every second */ - if (APLOGtrace6(ap_server_conf) && now - last_log > apr_time_from_sec(1)) { - ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf, - "connections: %u (waitio:%d write:%d keepalive:%d " - "lingering:%d suspended:%u), workers: %u/%u shutdown", - apr_atomic_read32(&connection_count), - apr_atomic_read32(waitio_q->total), - apr_atomic_read32(write_completion_q->total), - apr_atomic_read32(keepalive_q->total), - apr_atomic_read32(linger_q->total), - apr_atomic_read32(&suspended_count), - apr_atomic_read32(&threads_shutdown), - threads_per_child); - last_log = now; - } - #if HAVE_SERF rc = serf_context_prerun(g_serf); if (rc != APR_SUCCESS) { @@ -2512,15 +2547,32 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) timeout = next_expiry > now ? next_expiry - now : 0; } - /* When non-wakeable, don't wait more than 100 ms, in any case. */ - if (!listener_is_wakeable && (timeout < 0 || timeout > NON_WAKEABLE_TIMEOUT)) { - timeout = NON_WAKEABLE_TIMEOUT; + /* So long as there are connections, wake up at most every + * min_poll_timeout to refresh the scoreboard stats. + */ + if (timeout < 0 || timeout > min_poll_timeout) { + if (timeout > 0 + || !listener_is_wakeable + || apr_atomic_read32(&connection_count)) { + timeout = next_stats_time - now; + if (timeout <= 0 || timeout > min_poll_timeout) { + timeout = min_poll_timeout; + } + } + else { + /* No connections and entering infinite poll(), + * clear the stats first. + */ + force_stats = 1; + } } - else if (timeout > 0) { - /* apr_pollset_poll() might round down the timeout to - * milliseconds, let's forcibly round up here to never - * return before the timeout. - */ + update_stats(ps, now, &next_stats_time, force_stats); + + /* apr_pollset_poll() might round down the timeout to + * milliseconds, let's forcibly round up here to never + * return before the timeout. 
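+         * (e.g. a timeout of 1500us polls for 2ms, never for only 1ms).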
+ */ + if (timeout > 0) { timeout = apr_time_from_msec( apr_time_as_msec(timeout + apr_time_from_msec(1) - 1) ); @@ -2791,15 +2843,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) ap_log_error(APLOG_MARK, APLOG_TRACE7, 0, ap_server_conf, "queues maintained: next timeout=%" APR_TIME_T_FMT, next_expiry ? next_expiry - now : -1); - - ps->wait_io = apr_atomic_read32(waitio_q->total); - ps->write_completion = apr_atomic_read32(write_completion_q->total); - ps->keep_alive = apr_atomic_read32(keepalive_q->total); - ps->shutdown = apr_atomic_read32(shutdown_q->total); - ps->lingering_close = apr_atomic_read32(linger_q->total); - ps->backlog = apr_atomic_read32(backlog_q->total); - ps->suspended = apr_atomic_read32(&suspended_count); - ps->connections = apr_atomic_read32(&connection_count); } else if (next_shrink_time <= now && (workers_were_busy || apr_atomic_read32(&dying)) From fccc1622e5b321815e57c7304cdee27f736f4211 Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 27 Jun 2023 05:33:34 +0200 Subject: [PATCH 17/22] mpm_event: Autotuning from MaxRequestWorkers. --- server/mpm/event/event.c | 494 ++++++++++++++++++++++++++------------- 1 file changed, 329 insertions(+), 165 deletions(-) diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c index 7141c46ce87..3007dc8b33b 100644 --- a/server/mpm/event/event.c +++ b/server/mpm/event/event.c @@ -149,6 +149,21 @@ #define MAX_THREAD_LIMIT 100000 #endif +#ifndef DEFAULT_ASYNC_FACTOR +#define DEFAULT_ASYNC_FACTOR 2.0 +#endif + +#ifndef MAX_SPARE_THREADS_RATIO +#define MAX_SPARE_THREADS_RATIO 0.75 /* of MaxRequestWorkers */ +#endif +#ifndef MAX_DAEMONS_THREADS_RATIO +#define MAX_DAEMONS_THREADS_RATIO 32 +#endif + +#ifndef SCOREBOARD_DAEMONS_FACTOR +#define SCOREBOARD_DAEMONS_FACTOR 4 +#endif + #define MPM_CHILD_PID(i) (ap_scoreboard_image->parent[i].pid) #if !APR_VERSION_AT_LEAST(1,4,0) @@ -183,13 +198,6 @@ * Actual definitions of config globals */ -#ifndef DEFAULT_WORKER_FACTOR -#define DEFAULT_WORKER_FACTOR 2 -#endif -#define WORKER_FACTOR_SCALE 16 /* scale factor to allow fractional values */ -static unsigned int worker_factor = DEFAULT_WORKER_FACTOR * WORKER_FACTOR_SCALE; - /* AsyncRequestWorkerFactor * 16 */ - static int threads_per_child = 0; /* ThreadsPerChild */ static int ap_daemons_to_start = 0; /* StartServers */ static int min_spare_threads = 0; /* MinSpareThreads */ @@ -200,6 +208,12 @@ static int server_limit = 0; /* ServerLimit */ static int thread_limit = 0; /* ThreadLimit */ static int conns_this_child = 0; /* MaxConnectionsPerChild, only accessed in listener thread */ +static double async_factor = DEFAULT_ASYNC_FACTOR; /* AsyncRequestWorkerFactor */ + +static int auto_settings = 0; /* Auto settings based on max_workers + and num_online_cpus */ +static int num_online_cpus = 0; /* Number of CPUs detected */ + static /*atomic*/ apr_uint32_t dying = 0; static /*atomic*/ apr_uint32_t workers_may_exit = 0; static /*atomic*/ apr_uint32_t start_thread_may_exit = 0; @@ -627,11 +641,16 @@ typedef struct event_retained_data { apr_pool_t *gen_pool; /* generation pool (children start->stop lifetime) */ event_child_bucket *buckets; /* children buckets (reset per generation) */ + ap_listen_rec **listen_buckets; + int num_listen_buckets; + int first_server_limit; int first_thread_limit; + int first_server_sb_limit; int sick_child_detected; int maxclients_reported; int near_maxclients_reported; + /* * The max child slot ever assigned, preserved across restarts. 
Necessary * to deal with MaxRequestWorkers changes across AP_SIG_GRACEFUL restarts. @@ -815,7 +834,7 @@ static APR_INLINE int connections_above_limit(int *busy) /* Off by 'listensocks_disabled()' to avoid flip flop */ || c_count - l_count < (apr_uint32_t)threads_per_child + (i_count - listensocks_disabled()) * - (worker_factor / WORKER_FACTOR_SCALE)) { + async_factor) { return 0; } } @@ -3082,11 +3101,12 @@ static void setup_threads_runtime(void) const int good_methods[] = { APR_POLLSET_PORT, APR_POLLSET_KQUEUE, APR_POLLSET_EPOLL }; - /* XXX: K-A or lingering close connection included in the async factor */ - unsigned int async_factor = (worker_factor < WORKER_FACTOR_SCALE * 2 - ? WORKER_FACTOR_SCALE * 2 : worker_factor); - unsigned int async_threads = (threads_per_child * async_factor / WORKER_FACTOR_SCALE); - const apr_size_t pollset_size = (num_listensocks + async_threads + POLLSET_RESERVE_SIZE); + const double threads_factor = (async_factor < DEFAULT_ASYNC_FACTOR + ? DEFAULT_ASYNC_FACTOR + : async_factor); + const apr_size_t pollset_size = ((unsigned int)(threads_per_child * threads_factor) + + (unsigned int)num_listensocks + + POLLSET_RESERVE_SIZE); int pollset_flags; /* Event's skiplist operations will happen concurrently with other modules' @@ -4063,76 +4083,27 @@ static void server_main_loop(int remaining_children_to_start) static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) { - ap_listen_rec **listen_buckets = NULL; - int num_buckets = retained->mpm->num_buckets; int remaining_children_to_start; + int num_buckets, i; apr_status_t rv; - int i; ap_log_pid(pconf, ap_pid_fname); - /* On first startup create gen_pool to satisfy the lifetime of the - * parent's PODs and listeners; on restart stop the children from the - * previous generation and clear gen_pool for the next one. - */ - if (!retained->gen_pool) { - apr_pool_create(&retained->gen_pool, ap_pglobal); - } - else { - if (retained->mpm->was_graceful) { - /* wake up the children...time to die. But we'll have more soon */ - for (i = 0; i < num_buckets; i++) { - ap_mpm_podx_killpg(retained->buckets[i].pod, - active_daemons_limit, AP_MPM_PODX_GRACEFUL); - } - } - else { - /* Kill 'em all. Since the child acts the same on the parents SIGTERM - * and a SIGHUP, we may as well use the same signal, because some user - * pthreads are stealing signals from us left and right. - */ - for (i = 0; i < num_buckets; i++) { - ap_mpm_podx_killpg(retained->buckets[i].pod, - active_daemons_limit, AP_MPM_PODX_RESTART); - } - ap_reclaim_child_processes(1, /* Start with SIGTERM */ - event_note_child_stopped); - } - apr_pool_clear(retained->gen_pool); - retained->buckets = NULL; - - /* advance to the next generation */ - /* XXX: we really need to make sure this new generation number isn't in - * use by any of the previous children. - */ - ++retained->mpm->my_generation; - } - - /* On graceful restart, preserve the scoreboard and the listeners buckets. - * When ungraceful, clear the scoreboard and set num_buckets to zero to let - * ap_duplicate_listeners() below determine how many are needed/configured. - */ - if (!retained->mpm->was_graceful) { - if (ap_run_pre_mpm(s->process->pool, SB_SHARED) != OK) { - retained->mpm->mpm_state = AP_MPMQ_STOPPING; - return !OK; - } - num_buckets = (one_process) ? 
1 : 0; /* one_process => one bucket */ - retained->mpm->num_buckets = 0; /* reset idle_spawn_rate below */ + /* Preserve the scoreboard on graceful restart, reset when ungraceful */ + if (!retained->mpm->was_graceful + && ap_run_pre_mpm(s->process->pool, SB_SHARED)) { + retained->mpm->mpm_state = AP_MPMQ_STOPPING; + return !OK; } /* Now on for the new generation. */ ap_scoreboard_image->global->running_generation = retained->mpm->my_generation; ap_unixd_mpm_set_signals(pconf, one_process); - if ((rv = ap_duplicate_listeners(retained->gen_pool, ap_server_conf, - &listen_buckets, &num_buckets))) { - ap_log_error(APLOG_MARK, APLOG_CRIT, rv, - ap_server_conf, APLOGNO(03273) - "could not duplicate listeners"); - return !OK; - } - + /* Set the buckets listeners from the listen_buckets initialized + * in event_open_logs(). + */ + num_buckets = retained->num_listen_buckets; retained->buckets = apr_pcalloc(retained->gen_pool, num_buckets * sizeof(event_child_bucket)); for (i = 0; i < num_buckets; i++) { @@ -4144,8 +4115,11 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) "could not open pipe-of-death"); return !OK; } - retained->buckets[i].listeners = listen_buckets[i]; + retained->buckets[i].listeners = retained->listen_buckets[i]; } + /* Reset for the next generation/restart */ + retained->listen_buckets = NULL; + retained->num_listen_buckets = 0; /* If num_buckets changed, adjust max_spawn_rate and the free_slots buffer */ if (retained->mpm->num_buckets != num_buckets) { @@ -4178,23 +4152,14 @@ static int event_run(apr_pool_t * _pconf, apr_pool_t * plog, server_rec * s) retained->mpm->num_buckets = num_buckets; } - /* Don't thrash since num_buckets depends on the - * system and the number of online CPU cores... - */ - if (active_daemons_limit < num_buckets) - active_daemons_limit = num_buckets; - if (ap_daemons_to_start < num_buckets) - ap_daemons_to_start = num_buckets; - /* We want to create as much children at a time as the number of buckets, - * so to optimally accept connections (evenly distributed across buckets). - * Thus min_spare_threads should at least maintain num_buckets children, - * and max_spare_threads allow num_buckets more children w/o triggering - * immediately (e.g. num_buckets idle threads margin, one per bucket). - */ - if (min_spare_threads < threads_per_child * (num_buckets - 1) + num_buckets) - min_spare_threads = threads_per_child * (num_buckets - 1) + num_buckets; - if (max_spare_threads < min_spare_threads + (threads_per_child + 1) * num_buckets) - max_spare_threads = min_spare_threads + (threads_per_child + 1) * num_buckets; + ap_log_error(APLOG_MARK, APLOG_INFO, 0, ap_server_conf, APLOGNO(10464) + "MPM event settings%s: MaxRequestWorkers=%d AsyncRequestWorkerFactor=%.1lf " + "ThreadsPerChild=%d ThreadLimit=%d MinSpareThreads=%d MaxSpareThreads=%d " + "ServerLimit=%d/%d StartServers=%d Buckets=%d CPUs=%d", + auto_settings ? " (auto)" : "", max_workers, async_factor, + threads_per_child, thread_limit, min_spare_threads, max_spare_threads, + active_daemons_limit, server_limit, ap_daemons_to_start, + num_buckets, num_online_cpus); /* If we're doing a graceful_restart then we're going to see a lot * of children exiting immediately when we get into the main loop @@ -4382,12 +4347,18 @@ static int event_protocol_switch(conn_rec *c, request_rec *r, server_rec *s, /* This really should be a post_config hook, but the error log is already * redirected by that point, so we need to do this in the open_logs phase. 
+ * We compute num_buckets here too, thus the definitive AP_MPMQ_* settings + * which need it and which may be needed by the post_config hooks of other + * modules. */ static int event_open_logs(apr_pool_t * p, apr_pool_t * plog, apr_pool_t * ptemp, server_rec * s) { int startup = 0; int level_flags = 0; + int num_buckets = 0, i; + int min_threads; + apr_status_t rv; pconf = p; @@ -4408,6 +4379,152 @@ static int event_open_logs(apr_pool_t * p, apr_pool_t * plog, return !OK; } + /* On first startup create gen_pool to satisfy the lifetime of the + * parent's PODs and listeners; on restart stop the children from the + * previous generation and clear gen_pool for the next one. + */ + if (!retained->gen_pool) { + apr_pool_create(&retained->gen_pool, ap_pglobal); + } + else { + num_buckets = retained->mpm->num_buckets; + if (retained->mpm->was_graceful) { + /* wake up the children...time to die. But we'll have more soon */ + for (i = 0; i < num_buckets; i++) { + ap_mpm_podx_killpg(retained->buckets[i].pod, + active_daemons_limit, AP_MPM_PODX_GRACEFUL); + } + } + else { + /* Kill 'em all. Since the child acts the same on the parents SIGTERM + * and a SIGHUP, we may as well use the same signal, because some user + * pthreads are stealing signals from us left and right. + */ + for (i = 0; i < num_buckets; i++) { + ap_mpm_podx_killpg(retained->buckets[i].pod, + active_daemons_limit, AP_MPM_PODX_RESTART); + } + ap_reclaim_child_processes(1, /* Start with SIGTERM */ + event_note_child_stopped); + } + apr_pool_clear(retained->gen_pool); + retained->buckets = NULL; + + /* advance to the next generation */ + /* XXX: we really need to make sure this new generation number isn't in + * use by any of the previous children. + */ + ++retained->mpm->my_generation; + } + + /* On graceful restart, preserve the listeners buckets. When ungraceful, + * set num_buckets to zero to let ap_duplicate_listeners() below determine + * how many are needed/configured. + */ + if (!retained->mpm->was_graceful) { + num_buckets = (one_process) ? 1 : 0; /* one_process => one bucket */ + retained->mpm->num_buckets = 0; /* old gen's until event_run() */ + } + if ((rv = ap_duplicate_listeners(retained->gen_pool, ap_server_conf, + &retained->listen_buckets, + &num_buckets))) { + ap_log_error(APLOG_MARK, APLOG_ALERT | level_flags, rv, + (startup ? NULL : s), APLOGNO(03273) + "could not duplicate listeners, shutting down"); + return !OK; + } + retained->num_listen_buckets = num_buckets; + + /* Don't thrash since num_buckets depends on the system and the + * number of CPU cores, so make the settings consistent. 
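+     * All the derived limits below are therefore rounded to multiples of
+     * ThreadsPerChild * num_buckets where applicable.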
+ */ + if (retained->first_thread_limit) { + if (threads_per_child > retained->first_thread_limit) { + ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(10465) + "ThreadsPerChild (%d) exceeds initial ThreadLimit, " + "forcing ThreadsPerChild to %d", + threads_per_child, retained->first_thread_limit); + threads_per_child = retained->first_thread_limit; + } + } + else { + if (thread_limit < threads_per_child) { + thread_limit = threads_per_child; + } + retained->first_thread_limit = thread_limit; + } + min_threads = threads_per_child * num_buckets; + if (max_workers < min_threads) { + max_workers = min_threads; + } + else { + max_workers = (max_workers / min_threads) * min_threads; + } + active_daemons_limit = max_workers / threads_per_child; + if (retained->first_server_limit) { + if (active_daemons_limit > retained->first_server_sb_limit) { + int new_max_workers = retained->first_server_sb_limit * threads_per_child; + if (new_max_workers < min_threads) { + new_max_workers = min_threads; + } + else { + new_max_workers = (new_max_workers / min_threads) * min_threads; + } + ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(10466) + "MaxRequestWorkers (%d) / ThreadsPerChild (%d) would " + "exceed initial scoreboard limit (%d), forcing " + "MaxRequestWorkers to %d", + max_workers, threads_per_child, + retained->first_server_sb_limit, + new_max_workers); + max_workers = new_max_workers; + active_daemons_limit = retained->first_server_sb_limit; + } + server_limit = retained->first_server_sb_limit; + } + else { + /* Save the initial ServerLimit which cannot be changed on restart, but + * leave some spare room in the actual server_[sb_]limit (used to size + * the scoreboard) to allow for children restarting while the old gen + * is gracefully exiting. 
+ */ + retained->first_server_limit = server_limit; + if (server_limit < active_daemons_limit * SCOREBOARD_DAEMONS_FACTOR) { + server_limit = active_daemons_limit * SCOREBOARD_DAEMONS_FACTOR; + } + retained->first_server_sb_limit = server_limit; + } + if (ap_daemons_to_start < num_buckets) { + ap_daemons_to_start = num_buckets; + } + else if (ap_daemons_to_start < active_daemons_limit) { + ap_daemons_to_start = (ap_daemons_to_start / num_buckets) * num_buckets; + } + else { + ap_daemons_to_start = active_daemons_limit; + } + if (min_spare_threads < ap_daemons_to_start * threads_per_child) { + min_spare_threads = ap_daemons_to_start * threads_per_child; + } + else if (min_spare_threads < max_workers) { + min_spare_threads = (min_spare_threads / min_threads) * min_threads; + } + else { + min_spare_threads = max_workers; + } + if (max_spare_threads < 0) { /* auto settings */ + max_spare_threads = max_workers * MAX_SPARE_THREADS_RATIO; + } + if (max_spare_threads < min_spare_threads + min_threads) { + max_spare_threads = min_spare_threads + min_threads; + } + else if (max_spare_threads < max_workers) { + max_spare_threads = (max_spare_threads / min_threads) * min_threads; + } + else { + max_spare_threads = max_workers; + } + return OK; } @@ -4465,7 +4582,8 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, /* sigh, want this only the second time around */ if (retained->mpm->module_loads == 2) { - rv = apr_pollset_create(&event_pollset, 1, plog, + apr_pollset_t *tmp = NULL; + rv = apr_pollset_create(&tmp, 1, plog, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY); if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_CRIT, rv, NULL, APLOGNO(00495) @@ -4474,7 +4592,7 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, "Also check system or user limits!"); return HTTP_INTERNAL_SERVER_ERROR; } - apr_pollset_destroy(event_pollset); + apr_pollset_destroy(tmp); if (!one_process && !foreground) { /* before we detach, setup crash handlers to log to errorlog */ @@ -4492,21 +4610,25 @@ static int event_pre_config(apr_pool_t * pconf, apr_pool_t * plog, parent_pid = ap_my_pid = getpid(); ap_listen_pre_config(); - ap_daemons_to_start = DEFAULT_START_DAEMON; - min_spare_threads = DEFAULT_MIN_FREE_DAEMON * DEFAULT_THREADS_PER_CHILD; - max_spare_threads = DEFAULT_MAX_FREE_DAEMON * DEFAULT_THREADS_PER_CHILD; - server_limit = DEFAULT_SERVER_LIMIT; - thread_limit = DEFAULT_THREAD_LIMIT; - active_daemons_limit = server_limit; - threads_per_child = DEFAULT_THREADS_PER_CHILD; - max_workers = active_daemons_limit * threads_per_child; had_healthy_child = 0; ap_extended_status = 0; - event_pollset = NULL; - worker_queue_info = NULL; - listener_os_thread = NULL; - listener_is_wakeable = 0; + max_workers = -1; + threads_per_child = -1; + min_spare_threads = max_spare_threads = -1; + server_limit = thread_limit = -1; + ap_daemons_to_start = -1; + auto_settings = 0; + +#ifndef _SC_NPROCESSORS_ONLN + num_online_cpus = 1; +#else + num_online_cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (num_online_cpus < 1) { + num_online_cpus = 1; + } +#endif + async_factor = DEFAULT_ASYNC_FACTOR; return OK; } @@ -4563,7 +4685,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, startup = 1; } - if (server_limit > MAX_SERVER_LIMIT) { + if (server_limit < 0) { + server_limit = DEFAULT_SERVER_LIMIT; + } + else if (server_limit > MAX_SERVER_LIMIT) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00497) "WARNING: ServerLimit of %d exceeds compile-time " @@ 
-4577,7 +4702,7 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } server_limit = MAX_SERVER_LIMIT; } - else if (server_limit < 1) { + else if (server_limit == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00499) "WARNING: ServerLimit of %d not allowed, " @@ -4589,14 +4714,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } server_limit = 1; } - /* you cannot change ServerLimit across a restart; ignore * any such attempts */ - if (!retained->first_server_limit) { - retained->first_server_limit = server_limit; - } - else if (server_limit != retained->first_server_limit) { + if (retained->first_server_limit && server_limit != retained->first_server_limit) { /* don't need a startup console version here */ ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00501) "changing ServerLimit to %d from original value of %d " @@ -4605,7 +4726,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, server_limit = retained->first_server_limit; } - if (thread_limit > MAX_THREAD_LIMIT) { + if (thread_limit < 0) { + thread_limit = DEFAULT_THREAD_LIMIT; + } + else if (thread_limit > MAX_THREAD_LIMIT) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00502) "WARNING: ThreadLimit of %d exceeds compile-time " @@ -4619,7 +4743,7 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } thread_limit = MAX_THREAD_LIMIT; } - else if (thread_limit < 1) { + else if (thread_limit == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00504) "WARNING: ThreadLimit of %d not allowed, " @@ -4631,14 +4755,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } thread_limit = 1; } - /* you cannot change ThreadLimit across a restart; ignore * any such attempts */ - if (!retained->first_thread_limit) { - retained->first_thread_limit = thread_limit; - } - else if (thread_limit != retained->first_thread_limit) { + if (retained->first_thread_limit && thread_limit != retained->first_thread_limit) { /* don't need a startup console version here */ ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00506) "changing ThreadLimit to %d from original value of %d " @@ -4647,7 +4767,41 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, thread_limit = retained->first_thread_limit; } - if (threads_per_child > thread_limit) { + /* Auto settings depend on max_workers and num_buckets, the latter being + * known in event_open_logs() only. So defer to there (with no warnings + * since it's somewhat auto..). + */ + if (auto_settings) { + if (max_workers <= 0) { + /* This used to warn before auto settings, just take the + * default value still but silently. + */ + max_workers = DEFAULT_SERVER_LIMIT * DEFAULT_THREADS_PER_CHILD; + } + if (threads_per_child <= 0) { + /* Default threads_per_child is the number of CPUs */ + threads_per_child = num_online_cpus; + + /* With a lot of workers and not so much CPUs to handle them, + * spawn more threads to get a reasonable active_daemons_limit + * i.e. processes / threads ratio. + */ + while (max_workers / threads_per_child > + threads_per_child * MAX_DAEMONS_THREADS_RATIO) { + threads_per_child *= 2; + } + } + return OK; /* => event_open_logs() */ + } + + /* No auto settings; use the default for anything not set (or set to + * some negative value), warn about nonsense values and adjust otherwise. 
+ */ + + if (threads_per_child < 0) { + threads_per_child = DEFAULT_THREADS_PER_CHILD; + } + else if (threads_per_child > thread_limit) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00507) "WARNING: ThreadsPerChild of %d exceeds ThreadLimit " @@ -4662,7 +4816,7 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } threads_per_child = thread_limit; } - else if (threads_per_child < 1) { + else if (threads_per_child == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00509) "WARNING: ThreadsPerChild of %d not allowed, " @@ -4675,7 +4829,10 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, threads_per_child = 1; } - if (max_workers < threads_per_child) { + if (max_workers < 0) { + max_workers = DEFAULT_SERVER_LIMIT * DEFAULT_THREADS_PER_CHILD; + } + else if (max_workers < threads_per_child) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00511) "WARNING: MaxRequestWorkers of %d is less than " @@ -4693,27 +4850,6 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, } active_daemons_limit = max_workers / threads_per_child; - - if (max_workers % threads_per_child) { - int tmp_max_workers = active_daemons_limit * threads_per_child; - - if (startup) { - ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00513) - "WARNING: MaxRequestWorkers of %d is not an integer " - "multiple of ThreadsPerChild of %d, decreasing to nearest " - "multiple %d, for a maximum of %d servers.", - max_workers, threads_per_child, tmp_max_workers, - active_daemons_limit); - } else { - ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00514) - "MaxRequestWorkers of %d is not an integer multiple " - "of ThreadsPerChild of %d, decreasing to nearest " - "multiple %d", max_workers, threads_per_child, - tmp_max_workers); - } - max_workers = tmp_max_workers; - } - if (active_daemons_limit > server_limit) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00515) @@ -4730,10 +4866,34 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, server_limit * threads_per_child); } active_daemons_limit = server_limit; + max_workers = active_daemons_limit * threads_per_child; + } + else if (max_workers % threads_per_child) { + int new_max_workers = active_daemons_limit * threads_per_child; + if (startup) { + ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00513) + "WARNING: MaxRequestWorkers of %d is not an integer " + "multiple of ThreadsPerChild of %d, decreasing to nearest " + "multiple %d, for a maximum of %d servers.", + max_workers, threads_per_child, new_max_workers, + active_daemons_limit); + } else { + ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00514) + "MaxRequestWorkers of %d is not an integer multiple " + "of ThreadsPerChild of %d, decreasing to nearest " + "multiple %d", max_workers, threads_per_child, + new_max_workers); + } + max_workers = new_max_workers; } - /* ap_daemons_to_start > active_daemons_limit checked in ap_mpm_run() */ - if (ap_daemons_to_start < 1) { + if (ap_daemons_to_start < 0) { + ap_daemons_to_start = DEFAULT_START_DAEMON; + } + else if (ap_daemons_to_start > active_daemons_limit) { + ap_daemons_to_start = active_daemons_limit; + } + else if (ap_daemons_to_start == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00517) "WARNING: StartServers of %d not allowed, " @@ -4746,7 +4906,10 @@ 
static int event_check_config(apr_pool_t *p, apr_pool_t *plog, ap_daemons_to_start = 1; } - if (min_spare_threads < 1) { + if (min_spare_threads < 0) { + min_spare_threads = DEFAULT_MIN_FREE_DAEMON * DEFAULT_THREADS_PER_CHILD; + } + else if (min_spare_threads == 0) { if (startup) { ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00519) "WARNING: MinSpareThreads of %d not allowed, " @@ -4758,12 +4921,18 @@ static int event_check_config(apr_pool_t *p, apr_pool_t *plog, "MinSpareThreads of %d not allowed, increasing to 1", min_spare_threads); } - min_spare_threads = 1; + min_spare_threads = threads_per_child; } - /* max_spare_threads < min_spare_threads + threads_per_child - * checked in ap_mpm_run() - */ + if (max_spare_threads < 0) { + max_spare_threads = DEFAULT_MAX_FREE_DAEMON * DEFAULT_THREADS_PER_CHILD; + } + else { + /* max_spare_threads value has never been checked, it's silently + * adjusted in event_open_logs() such that max_spare_threads >= + * min_spare_threads + threads_per_child. + */ + } return OK; } @@ -4839,7 +5008,7 @@ static const char *set_max_spare_threads(cmd_parms * cmd, void *dummy, } static const char *set_max_workers(cmd_parms * cmd, void *dummy, - const char *arg) + const char *arg, const char *arg2) { const char *err = ap_check_cmd_context(cmd, GLOBAL_ONLY); if (err != NULL) { @@ -4850,7 +5019,10 @@ static const char *set_max_workers(cmd_parms * cmd, void *dummy, "MaxClients is deprecated, use MaxRequestWorkers " "instead."); } + max_workers = atoi(arg); + auto_settings = (arg2 && !strcasecmp(arg2, "auto")); + return NULL; } @@ -4891,23 +5063,15 @@ static const char *set_thread_limit(cmd_parms * cmd, void *dummy, static const char *set_worker_factor(cmd_parms * cmd, void *dummy, const char *arg) { - double val; char *endptr; const char *err = ap_check_cmd_context(cmd, GLOBAL_ONLY); if (err != NULL) { return err; } - val = strtod(arg, &endptr); - if (*endptr) - return "error parsing value"; - - if (val <= 0) - return "AsyncRequestWorkerFactor argument must be a positive number"; - - worker_factor = val * WORKER_FACTOR_SCALE; - if (worker_factor < WORKER_FACTOR_SCALE) { - worker_factor = WORKER_FACTOR_SCALE; + async_factor = strtod(arg, &endptr); + if (*endptr || async_factor < 1.0) { + return "AsyncRequestWorkerFactor must be a rational number greater or equal to 1"; } return NULL; } @@ -4923,10 +5087,10 @@ static const command_rec event_cmds[] = { "Minimum number of idle threads, to handle request spikes"), AP_INIT_TAKE1("MaxSpareThreads", set_max_spare_threads, NULL, RSRC_CONF, "Maximum number of idle threads"), - AP_INIT_TAKE1("MaxClients", set_max_workers, NULL, RSRC_CONF, - "Deprecated name of MaxRequestWorkers"), - AP_INIT_TAKE1("MaxRequestWorkers", set_max_workers, NULL, RSRC_CONF, - "Maximum number of threads alive at the same time"), + AP_INIT_TAKE12("MaxClients", set_max_workers, NULL, RSRC_CONF, + "Deprecated name of MaxRequestWorkers"), + AP_INIT_TAKE12("MaxRequestWorkers", set_max_workers, NULL, RSRC_CONF, + "Maximum number of threads alive at the same time"), AP_INIT_TAKE1("ThreadsPerChild", set_threads_per_child, NULL, RSRC_CONF, "Number of threads each child creates"), AP_INIT_TAKE1("ThreadLimit", set_thread_limit, NULL, RSRC_CONF, From fb8839306b1eb6f8a8633989f2504d7b45696edb Mon Sep 17 00:00:00 2001 From: ylavic Date: Wed, 10 Jul 2024 15:10:50 +0200 Subject: [PATCH 18/22] mpm_event: Propose some new connections_above_limit() heuristics. 
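
For illustration, the retained heuristic (LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS
below) boils down to the following standalone sketch. The function and its
plain-int parameters are illustrative only, not the actual event.c code, which
reads atomic counters; per the patch the hard cap corresponds to
workers_backlog_limit = threads_per_child * async_factor:

    /* Sketch: stop accepting when the potentially blocking part of the
     * backlog exceeds the idle workers, or when the whole backlog exceeds
     * a hard cap. Plain ints stand in for the MPM's atomics.
     */
    static int above_limit_sketch(int idlers,           /* >= 0: idle workers;
                                                           < 0: conns waiting in
                                                           the backlog */
                                  int backlog_nonblock, /* backlog conns that
                                                           cannot block a worker */
                                  int backlog_limit)    /* hard backlog cap */
    {
        if (idlers >= -backlog_limit) {
            /* "nonblocking" backlog conns don't count against the idlers */
            idlers += backlog_nonblock;
            if (idlers >= 0) {
                return 0; /* keep accepting */
            }
        }
        return 1; /* disable the listening sockets */
    }

For instance (still with illustrative numbers):

    above_limit_sketch(2, 0, 100);     /* == 0: two idle workers, no backlog */
    above_limit_sketch(-4, 4, 100);    /* == 0: backlog is all "nonblocking" */
    above_limit_sketch(-101, 90, 100); /* == 1: backlog above the hard cap */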
---
 server/mpm/event/event.c | 155 +++++++++++++++++++++++++++++++++++----
 1 file changed, 141 insertions(+), 14 deletions(-)

diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index 3007dc8b33b..e0ba249bbf7 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -214,6 +214,9 @@ static int auto_settings = 0; /* Auto settings based on max_worker and num_online_cpus */
 static int num_online_cpus = 0; /* Number of CPUs detected */
 
+static int workers_backlog_limit = 0; /* Max number of events in the workers' backlog
+                                         (above which not accepting new connections) */
+
 static /*atomic*/ apr_uint32_t dying = 0;
 static /*atomic*/ apr_uint32_t workers_may_exit = 0;
 static /*atomic*/ apr_uint32_t start_thread_may_exit = 0;
@@ -824,23 +827,119 @@ static APR_INLINE int listensocks_disabled(void)
     return apr_atomic_read32(&listensocks_off) != 0;
 }
 
-static APR_INLINE int connections_above_limit(int *busy)
+/* Choose one of these */
+#define LIMIT_BY_CONNS_TOTAL_VS_IDLERS 0
+#define LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS 0
+#define LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS 1 /* the winner? */
+#define LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS 0
+
+#if LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS
+/* The rationale for backlog_nonblock_count is that only connections about
+ * to be processed outside the MPM can make a worker thread block, since we
+ * have no guarantee that modules won't block processing them. The core will
+ * not block processing TLS handshakes or reading the HTTP header for instance,
+ * but once the connections are passed to modules they may block in a handler
+ * reading the body or whatever. Those connections are in CONN_STATE_PROCESSING
+ * state in the backlog, which includes newly accepted connections and the ones
+ * waking up from CONN_STATE_KEEPALIVE and CONN_STATE_ASYNC_WAITIO.
+ * But the processing by/inside the MPM itself never blocks, so the connections
+ * fully handled by the MPM can be accounted for differently in
+ * connections_above_limit(), which is where backlog_nonblock_count helps.
+ */
+static /*atomic*/ apr_uint32_t backlog_nonblock_count;
+#endif
+
+static APR_INLINE int connections_above_limit(void)
 {
-    apr_int32_t i_count = ap_queue_info_idlers_count(worker_queue_info);
-    if (i_count > 0) {
-        apr_uint32_t c_count = apr_atomic_read32(&connection_count);
-        apr_uint32_t l_count = apr_atomic_read32(linger_q->total);
-        if (c_count <= l_count
-            /* Off by 'listensocks_disabled()' to avoid flip flop */
-            || c_count - l_count < (apr_uint32_t)threads_per_child +
-                                   (i_count - listensocks_disabled()) *
-                                   async_factor) {
+    /* Note that idlers >= 0 gives the number of idle workers, idlers < 0 gives
+     * the number of connections in the backlog waiting for an idle worker.
+     */
+    int idlers = ap_queue_info_idlers_count(worker_queue_info);
+
+#if LIMIT_BY_CONNS_TOTAL_VS_IDLERS
+
+    /* Limit reached when the number of connections (excluding the ones in
+     * lingering close) is above the number of idle workers.
+     */
+    if (idlers >= 0) {
+        int conns = (apr_atomic_read32(&connection_count) -
+                     apr_atomic_read32(linger_q->total));
+        AP_DEBUG_ASSERT(conns >= 0);
+        if (idlers >= conns) {
+            return 0;
+        }
+    }
+
+#elif LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS
+
+    /* Limit reached when the number of potentially blocking connections in
+     * the backlog is above the number of idle workers.
+     *
+     * Ignore connections in the backlog with "nonblocking" states by adding
+     * them back.
+     */
+    idlers += apr_atomic_read32(&backlog_nonblock_count);
+    if (idlers >= 0) {
+        return 0;
+    }
+
+#elif LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS
+
+    /* Limit reached when the number of potentially blocking connections in
+     * the backlog is above the number of idle workers, or the total number
+     * of connections waiting for a worker in the backlog is above some hard
+     * workers_backlog_limit.
+     */
+    if (idlers >= -workers_backlog_limit) {
+        /* Ignore connections in the backlog with "nonblocking" states by
+         * adding them back.
+         */
+        idlers += apr_atomic_read32(&backlog_nonblock_count);
+        if (idlers >= 0) {
+            return 0;
+        }
+    }
+
+#elif LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS
+
+    /* Limit reached when the number of potentially blocking connections in
+     * the backlog *and* the queues is above the number of idle workers, or
+     * the total number of connections waiting for a worker in the backlog
+     * is above some hard workers_backlog_limit.
+     */
+    if (idlers >= -workers_backlog_limit) {
+        /* Ignore connections in the backlog with "nonblocking" states by
+         * adding them back.
+         */
+        idlers += apr_atomic_read32(&backlog_nonblock_count);
+        if (idlers >= (apr_atomic_read32(keepalive_q->total) +
+                       apr_atomic_read32(waitio_q->total))) {
             return 0;
         }
     }
-    else if (busy) {
-        *busy = 1;
+
+#else
+
+    /* Legacy heuristic, but w/o ignoring the keepalive_q (not shrunk anymore).
+     * Limit reached when the number of conns (besides lingering close ones)
+     * is above some unclear limit (the total number of workers plus the
+     * number of idle workers times the async factor).
+     */
+    int off = listensocks_disabled(); /* off by disabled() to limit flip flop */
+    if (idlers >= off) {
+        int avail = (threads_per_child + (int)((idlers - off) * async_factor));
+        int conns = (apr_atomic_read32(&connection_count) -
+                     apr_atomic_read32(linger_q->total));
+        AP_DEBUG_ASSERT(conns >= 0);
+        if (avail >= conns) {
+            return 0;
+        }
     }
+
+#endif
+
     return 1;
 }
 
@@ -848,7 +947,7 @@ static APR_INLINE int should_enable_listensocks(void)
 {
     return (listensocks_disabled()
             && !apr_atomic_read32(&dying)
-            && !connections_above_limit(NULL));
+            && !connections_above_limit());
 }
 
 static void close_socket_at(apr_socket_t *csd,
@@ -1888,8 +1987,34 @@ static void conn_state_backlog_cb(void *baton, int pushed)
 
     if (pushed) {
         TO_QUEUE_APPEND(cs->sc->bl_q, cs);
+#if LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS
+        if (cs->pub.state != CONN_STATE_PROCESSING) {
+            /* These connections won't block when processed.
+             *
+             * Increment *after* TO_QUEUE_APPEND() to make sure that:
+             *   cs->sc->bl_q->total >= backlog_nonblock_count
+             * always holds.
+             */
+            apr_atomic_inc32(&backlog_nonblock_count);
+        }
+#endif
     }
     else { /* popped */
+#if LIMIT_BY_BACKLOG_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_VS_IDLERS \
+    || LIMIT_BY_BACKLOG_TOTAL_AND_MAYBLOCK_AND_QUEUES_VS_IDLERS
+        if (cs->pub.state != CONN_STATE_PROCESSING) {
+            /* These connections won't block when processed.
+             *
+             * Decrement *before* TO_QUEUE_REMOVE() to make sure that:
+             *   cs->sc->bl_q->total >= backlog_nonblock_count
+             * always holds.
+             */
+            apr_atomic_dec32(&backlog_nonblock_count);
+        }
+#endif
         TO_QUEUE_REMOVE(cs->sc->bl_q, cs);
 
         /* not in backlog anymore */
@@ -1932,7 +2057,7 @@ static void push2worker(event_conn_state_t *cs, timer_event_t *te,
      * the situation settles down.
The listener and new idling workers will * test for should_enable_listensocks() to recover (when suitable). */ - if (connections_above_limit(NULL)) { + if (connections_above_limit()) { disable_listensocks(); if (above_limit) { *above_limit = 1; @@ -4525,6 +4650,8 @@ static int event_open_logs(apr_pool_t * p, apr_pool_t * plog, max_spare_threads = max_workers; } + workers_backlog_limit = threads_per_child * async_factor; + return OK; } From 94baa05601f6c0ea936ae42c18f2acd923859091 Mon Sep 17 00:00:00 2001 From: ylavic Date: Mon, 8 Jul 2024 19:19:22 +0200 Subject: [PATCH 19/22] mod_ssl: Nonblocking/async handshakes in CONN_STATE_PROCESSING phase. If AP_MPMQ_CAN_WAITIO, make mod_ssl perform non blocking TLS handshakes and go async when it would block. --- changes-entries/mod_ssl_async_handshakes.txt | 1 + modules/ssl/mod_ssl.c | 66 ++++++++++++++++---- modules/ssl/ssl_engine_io.c | 59 ++++++++++++++--- modules/ssl/ssl_private.h | 7 +++ 4 files changed, 112 insertions(+), 21 deletions(-) create mode 100644 changes-entries/mod_ssl_async_handshakes.txt diff --git a/changes-entries/mod_ssl_async_handshakes.txt b/changes-entries/mod_ssl_async_handshakes.txt new file mode 100644 index 00000000000..e19eeb629de --- /dev/null +++ b/changes-entries/mod_ssl_async_handshakes.txt @@ -0,0 +1 @@ + *) mod_ssl: Perform non blocking and async TLS handshakes. [Graham Leggett] diff --git a/modules/ssl/mod_ssl.c b/modules/ssl/mod_ssl.c index 420ae6b79ac..5cae44a64a8 100644 --- a/modules/ssl/mod_ssl.c +++ b/modules/ssl/mod_ssl.c @@ -29,6 +29,7 @@ #include "util_md5.h" #include "util_mutex.h" #include "ap_provider.h" +#include "ap_mpm.h" #include "http_config.h" #include "mod_proxy.h" /* for proxy_hook_section_post_config() */ @@ -40,6 +41,8 @@ int ssl_running_on_valgrind = 0; #endif +static int mpm_can_waitio = 0; + #if HAVE_OPENSSL_INIT_SSL || (OPENSSL_VERSION_NUMBER >= 0x10100000L && \ !defined(LIBRESSL_VERSION_NUMBER)) /* Openssl v1.1+ handles all termination automatically from @@ -464,6 +467,16 @@ static int ssl_hook_pre_config(apr_pool_t *pconf, return OK; } +static int ssl_hook_post_config(apr_pool_t *pconf, apr_pool_t *plog, + apr_pool_t *ptemp, server_rec *s) +{ + if (ap_mpm_query(AP_MPMQ_CAN_WAITIO, &mpm_can_waitio) != APR_SUCCESS) { + mpm_can_waitio = 0; + } + + return OK; +} + static SSLConnRec *ssl_init_connection_ctx(conn_rec *c, ap_conf_vector_t *per_dir_config, int reinit) @@ -692,8 +705,9 @@ static int ssl_hook_pre_connection(conn_rec *c, void *csd) static int ssl_hook_process_connection(conn_rec* c) { SSLConnRec *sslconn = myConnConfig(c); + int status = DECLINED; - if (sslconn && !sslconn->disabled) { + if (sslconn && !sslconn->disabled && !sslconn->initialized) { /* On an active SSL connection, let the input filters initialize * themselves which triggers the handshake, which again triggers * all kinds of useful things such as SNI and ALPN. @@ -701,23 +715,50 @@ static int ssl_hook_process_connection(conn_rec* c) apr_bucket_brigade* temp; apr_status_t rv; - temp = apr_brigade_create(c->pool, c->bucket_alloc); - rv = ap_get_brigade(c->input_filters, temp, - AP_MODE_INIT, APR_BLOCK_READ, 0); - apr_brigade_destroy(temp); - - if (APR_SUCCESS != APR_SUCCESS) { + temp = ap_acquire_brigade(c); + rv = ap_get_brigade(c->input_filters, temp, AP_MODE_INIT, + mpm_can_waitio ? 
APR_NONBLOCK_READ : APR_BLOCK_READ,
+                            0);
+        ap_release_brigade(c, temp);
+
+        if (rv == APR_SUCCESS) {
+            /* great news, let's continue */
+            ap_log_cerror(APLOG_MARK, APLOG_DEBUG, 0, c, APLOGNO(10370)
+                          "SSL handshake completed, continuing");
+            sslconn->initialized = 1;
+        }
+        else if (rv == MODSSL_ERROR_HTTP_ON_HTTPS) {
+            /* Plain HTTP spoken on https port, mod_ssl wants to be called
+             * without AP_MODE_INIT.
+             */
+            ap_log_cerror(APLOG_MARK, APLOG_DEBUG, 0, c, APLOGNO(10371)
+                          "SSL handshake with plain HTTP, continuing");
+            sslconn->initialized = 1;
+        }
+        else if (mpm_can_waitio && APR_STATUS_IS_EAGAIN(rv)) {
+            /* Take advantage of an async MPM. If we see an EAGAIN,
+             * loop round and don't block.
+             */
+            ap_log_cerror(APLOG_MARK, APLOG_DEBUG, 0, c, APLOGNO(10372)
+                          "SSL handshake in progress, try again later");
             if (c->cs) {
-                c->cs->state = CONN_STATE_LINGER;
+                c->cs->state = CONN_STATE_ASYNC_WAITIO;
             }
-            ap_log_cerror(APLOG_MARK, APLOG_ERR, rv, c, APLOGNO(10373)
+            status = OK;
+        }
+        else {
+            /* we failed, give up */
+            ap_log_cerror(APLOG_MARK, APLOG_INFO, rv, c, APLOGNO(10373)
                           "SSL handshake was not completed, "
                           "closing connection");
-            return OK;
+            if (c->cs) {
+                c->cs->state = CONN_STATE_LINGER;
+            }
+            status = OK;
         }
     }
-
-    return DECLINED;
+
+    return status;
 }
 
 /*
@@ -746,6 +787,7 @@ static void ssl_register_hooks(apr_pool_t *p)
     ap_hook_http_scheme   (ssl_hook_http_scheme,   NULL,NULL, APR_HOOK_MIDDLE);
     ap_hook_default_port  (ssl_hook_default_port,  NULL,NULL, APR_HOOK_MIDDLE);
     ap_hook_pre_config    (ssl_hook_pre_config,    NULL,NULL, APR_HOOK_MIDDLE);
+    ap_hook_post_config   (ssl_hook_post_config,   NULL,NULL, APR_HOOK_MIDDLE);
     ap_hook_child_init    (ssl_init_Child,         NULL,NULL, APR_HOOK_MIDDLE);
     ap_hook_post_read_request(ssl_hook_ReadReq, pre_prr,NULL, APR_HOOK_MIDDLE);
     ap_hook_check_access  (ssl_hook_Access,        NULL,NULL, APR_HOOK_MIDDLE,
diff --git a/modules/ssl/ssl_engine_io.c b/modules/ssl/ssl_engine_io.c
index 3a2e841ae02..06ebeac2247 100644
--- a/modules/ssl/ssl_engine_io.c
+++ b/modules/ssl/ssl_engine_io.c
@@ -292,6 +292,7 @@ typedef struct {
 } char_buffer_t;
 
 typedef struct {
+    conn_rec *c;
     SSL *ssl;
     BIO *bio_out;
     ap_filter_t *f;
@@ -730,6 +731,32 @@ static apr_status_t ssl_io_input_read(bio_filter_in_ctx_t *inctx,
              * (This is usually the case when the client forces an SSL
              * renegotiation which is handled implicitly by OpenSSL.)
              */
+            if (inctx->c->cs) {
+                inctx->c->cs->sense = CONN_SENSE_WANT_READ;
+            }
+            inctx->rc = APR_EAGAIN;
+
+            if (*len > 0) {
+                inctx->rc = APR_SUCCESS;
+                break;
+            }
+            if (inctx->block == APR_NONBLOCK_READ) {
+                break;
+            }
+            continue; /* Blocking and nothing yet? Try again. */
+        }
+        if (ssl_err == SSL_ERROR_WANT_WRITE) {
+            /*
+             * If OpenSSL wants to write during read, and we were
+             * nonblocking, report as an EAGAIN. Otherwise loop,
+             * pulling more data from network filter.
+             *
+             * (This is usually the case when the client forces an SSL
+             * renegotiation which is handled implicitly by OpenSSL.)
+             */
+            if (inctx->c->cs) {
+                inctx->c->cs->sense = CONN_SENSE_WANT_WRITE;
+            }
             inctx->rc = APR_EAGAIN;
 
             if (*len > 0) {
@@ -895,7 +922,9 @@ static apr_status_t ssl_filter_write(ap_filter_t *f,
              * (This is usually the case when the client forces an SSL
              * renegotiation which is handled implicitly by OpenSSL.)
              */
-            outctx->c->cs->sense = CONN_SENSE_WANT_READ;
+            if (outctx->c->cs) {
+                outctx->c->cs->sense = CONN_SENSE_WANT_READ;
+            }
             outctx->rc = APR_EAGAIN;
             ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, outctx->c,
                           "Want read during nonblocking write");
@@ -950,10 +979,6 @@ static apr_status_t ssl_filter_write(ap_filter_t *f,
                                   sizeof(HTTP_ON_HTTPS_PORT) - 1, \
                                   alloc)
 
-/* Custom apr_status_t error code, used when a plain HTTP request is
- * received on an SSL port. */
-#define MODSSL_ERROR_HTTP_ON_HTTPS (APR_OS_START_USERERR + 0)
-
 /* Custom apr_status_t error code, used when the proxy cannot
  * establish an outgoing SSL connection. */
 #define MODSSL_ERROR_BAD_GATEWAY (APR_OS_START_USERERR + 1)
@@ -989,7 +1014,7 @@ static apr_status_t ssl_io_filter_error(bio_filter_in_ctx_t *inctx,
             f->c->keepalive = AP_CONN_CLOSE;
             if (is_init) {
                 sslconn->non_ssl_request = NON_SSL_SEND_REQLINE;
-                return AP_FILTER_ERROR;
+                return MODSSL_ERROR_HTTP_ON_HTTPS;
             }
             sslconn->non_ssl_request = NON_SSL_SEND_HDR_SEP;
@@ -1424,10 +1449,25 @@ static apr_status_t ssl_io_filter_handshake(ssl_filter_ctx_t *filter_ctx)
         }
         else if (ssl_err == SSL_ERROR_WANT_READ) {
             /*
-             * This is in addition to what was present earlier. It is
-             * borrowed from openssl_state_machine.c [mod_tls].
-             * TBD.
+             * Call us back when ready to read
              */
+            ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, outctx->c,
+                          "Want read during nonblocking accept");
+            if (outctx->c->cs) {
+                outctx->c->cs->sense = CONN_SENSE_WANT_READ;
+            }
+            outctx->rc = APR_EAGAIN;
+            return APR_EAGAIN;
+        }
+        else if (ssl_err == SSL_ERROR_WANT_WRITE) {
+            /*
+             * Call us back when ready to write
+             */
+            ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, outctx->c,
+                          "Want write during nonblocking accept");
+            if (outctx->c->cs) {
+                outctx->c->cs->sense = CONN_SENSE_WANT_WRITE;
+            }
             outctx->rc = APR_EAGAIN;
             return APR_EAGAIN;
         }
@@ -2230,6 +2270,7 @@ static apr_status_t ssl_io_input_add_filter(ssl_filter_ctx_t *filter_ctx, conn_r
     }
 
     BIO_set_data(filter_ctx->pbioRead, (void *)inctx);
+    inctx->c = c;
     inctx->ssl = ssl;
     inctx->bio_out = filter_ctx->pbioWrite;
     inctx->f = filter_ctx->pInputFilter;
diff --git a/modules/ssl/ssl_private.h b/modules/ssl/ssl_private.h
index 2f7bb51fa5a..dc2f4f0d98b 100644
--- a/modules/ssl/ssl_private.h
+++ b/modules/ssl/ssl_private.h
@@ -367,6 +367,12 @@ APLOG_USE_MODULE(ssl);
 #define mySrvConfigFromConn(c) mySrvConfig(mySrvFromConn(c))
 #define myModConfigFromConn(c) myModConfig(mySrvFromConn(c))
 
+/**
+ * Custom apr_status_t error code, used when a plain HTTP request is
+ * received on an SSL port.
+ */
+#define MODSSL_ERROR_HTTP_ON_HTTPS (APR_OS_START_USERERR + 0)
+
 /**
  * Defaults for the configuration
  */
@@ -582,6 +588,7 @@ typedef struct {
     const char *verify_info;
     const char *verify_error;
     int verify_depth;
+    int initialized;
     int disabled;
     enum {
         NON_SSL_OK = 0, /* is SSL request, or error handling completed */

From 6cbda1f1fa81fc7fdfdeeb8e45619978c1f8950f Mon Sep 17 00:00:00 2001
From: ylavic
Date: Tue, 9 Jul 2024 11:37:58 +0200
Subject: [PATCH 20/22] core,http: Non blocking HTTP header read.
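
In short, an async MPM caller is expected to drive the new
ap_read_request_ex() as sketched below. This is a simplified rendition of the
ap_process_http_async_connection() hunk further down, not the actual code:
error handling and scoreboard updates are elided, and mpm_can_waitio stands
for the result of ap_mpm_query(AP_MPMQ_CAN_WAITIO, ...):

    static int mpm_can_waitio; /* from ap_mpm_query(AP_MPMQ_CAN_WAITIO, ...) */

    static int process_connection_sketch(conn_rec *c)
    {
        request_rec *r = NULL;
        /* slave connections (i.e. h2_c2) are not ready for WAITIO yet */
        apr_read_type_e block = (mpm_can_waitio && !c->master)
                                ? APR_NONBLOCK_READ : APR_BLOCK_READ;
        apr_status_t rv = ap_read_request_ex(&r, c, block);

        if (APR_STATUS_IS_EAGAIN(rv)) {
            /* Header not fully received yet; c->partial_request keeps the
             * parsing state and the MPM polls and calls us back later.
             */
            c->cs->state = CONN_STATE_ASYNC_WAITIO;
            return OK;
        }
        if (rv == APR_SUCCESS && r->status == HTTP_OK) {
            c->cs->state = CONN_STATE_HANDLER;
            ap_process_async_request(r);
            return OK;
        }
        c->cs->state = CONN_STATE_LINGER; /* read error or bad request */
        return OK;
    }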
--- include/http_protocol.h | 75 +++- include/httpd.h | 3 + include/mod_core.h | 8 +- modules/http/http_core.c | 43 +- modules/http/http_filters.c | 158 ++++--- modules/http2/h2_stream.c | 2 +- modules/proxy/mod_proxy_http.c | 6 +- server/core.c | 2 +- server/core_filters.c | 71 ++-- server/protocol.c | 753 +++++++++++++++++---------------- 10 files changed, 647 insertions(+), 474 deletions(-) diff --git a/include/http_protocol.h b/include/http_protocol.h index 2b509b341fe..0290abef450 100644 --- a/include/http_protocol.h +++ b/include/http_protocol.h @@ -54,19 +54,30 @@ AP_DECLARE_DATA extern ap_filter_rec_t *ap_old_write_func; */ /** - * Read an empty request and set reasonable defaults. + * Create an empty request and set reasonable defaults. * @param c The current connection * @return The new request_rec */ AP_DECLARE(request_rec *) ap_create_request(conn_rec *c); /** - * Read a request and fill in the fields. + * Read the request line and header fields. * @param c The current connection * @return The new request_rec */ AP_DECLARE(request_rec *) ap_read_request(conn_rec *c); +/** + * Read the request line and header fields, possibly non-blocking. + * @param r The request read + * @param c The connection to read from + * @param block How the read should be performed + * ::APR_BLOCK_READ, ::APR_NONBLOCK_READ + * @return APR_SUCCESS, APR_EAGAIN or APR_EGENERAL + */ +AP_DECLARE(apr_status_t) ap_read_request_ex(request_rec **r, conn_rec *c, + apr_read_type_e block); + /** * Assign the method, uri and protocol (in HTTP/1.x the * items from the first line) to the request. @@ -107,6 +118,12 @@ AP_DECLARE(int) ap_parse_request_line(request_rec *r); */ AP_DECLARE(int) ap_check_request_header(request_rec *r); +/** + * Reentrant state for ap_fgetline_ex() and ap_get_mime_headers_ex() + */ +struct ap_getline_state; /* opaque */ +typedef struct ap_getline_state ap_getline_state_t; + /** * Read the mime-encoded headers. * @param r The current request @@ -122,6 +139,23 @@ AP_DECLARE(void) ap_get_mime_headers(request_rec *r); AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb); +/** + * Reentrant version of ap_get_mime_headers() reading from an input + * filter in blocking or non-blocking mode. + * @param r The current request + * @param f Input filter to read from + * @param block How the operations should be performed + * ::APR_BLOCK_READ, ::APR_NONBLOCK_READ + * @param bb temp brigade + * @param state_p State of the parsing, must point to NULL on first call + * and points to NULL on output if APR_EAGAIN is not returned + */ +AP_DECLARE(apr_status_t) ap_get_mime_headers_ex(request_rec *r, + ap_filter_t *f, + apr_read_type_e block, + apr_bucket_brigade *bb, + ap_getline_state_t **state_p); + /** * Run post_read_request hook and validate. 
* @param r The current request @@ -744,11 +778,13 @@ AP_DECLARE(apr_status_t) ap_get_basic_auth_components(const request_rec *r, */ AP_CORE_DECLARE(void) ap_parse_uri(request_rec *r, const char *uri); -#define AP_GETLINE_FOLD (1 << 0) /* Whether to merge continuation lines */ -#define AP_GETLINE_CRLF (1 << 1) /* Whether line ends must be CRLF */ -#define AP_GETLINE_NOSPC_EOL (1 << 2) /* Whether to consume up to and including - the end of line on APR_ENOSPC */ -#define AP_GETLINE_NONBLOCK (1 << 3) /* Whether to read non-blocking */ +#define AP_GETLINE_FOLD (1 << 0) /* Whether to merge continuation lines */ +#define AP_GETLINE_CRLF (1 << 1) /* Whether line ends must be CRLF */ +#define AP_GETLINE_NOSPC_EOL (1 << 2) /* Whether to consume up to and including + the end of line on APR_ENOSPC */ +#define AP_GETLINE_NONBLOCK (1 << 3) /* Whether to read non-blocking */ +#define AP_GETLINE_ALLOC (1 << 4) /* Whether to allocate the returned line */ +#define AP_GETLINE_FOLD_COL (1 << 5 | AP_GETLINE_FOLD) /* Fold after colon only */ /** * Get the next line of input for the request @@ -783,6 +819,31 @@ AP_DECLARE(apr_status_t) ap_fgetline(char **s, apr_size_t n, int flags, apr_bucket_brigade *bb, apr_pool_t *p); +/** + * Get the next line from an input filter, reentrant (e.g. EAGAIN). + * + * @param s Pointer to the pointer to the buffer into which the line + * should be read; if *s==NULL, a buffer of the necessary size + * to hold the data will be allocated from \p p + * @param n The size of the buffer + * @param read The length of the line. + * @param f Input filter to read from + * @param flags Bit mask of AP_GETLINE_* options + * @param bb Working brigade to use when reading buckets + * @param state_p State of the parsing, must point to NULL on first call + * and points to NULL on output if APR_EAGAIN is not returned + * @param p The pool to allocate the buffer from (if needed) + * @return APR_SUCCESS, if successful + * APR_ENOSPC, if the line is too big to fit in the buffer + * APR_EAGAIN, if non-blocking IO would block + * Other errors where appropriate + */ +AP_DECLARE(apr_status_t) ap_fgetline_ex(char **s, apr_size_t n, + apr_size_t *read, ap_filter_t *f, + int flags, apr_bucket_brigade *bb, + ap_getline_state_t **state_p, + apr_pool_t *p); + /** * @see ap_fgetline * diff --git a/include/httpd.h b/include/httpd.h index c3f72fceb7e..ae08740b227 100644 --- a/include/httpd.h +++ b/include/httpd.h @@ -1315,6 +1315,9 @@ struct conn_rec { int async_filter; int outgoing; + + /** Partial request being read (non-blocking) */ + request_rec *partial_request; }; struct conn_slave_rec { diff --git a/include/mod_core.h b/include/mod_core.h index f9cc0611f4c..b4a40de2d5d 100644 --- a/include/mod_core.h +++ b/include/mod_core.h @@ -41,7 +41,7 @@ extern "C" { /* Handles for core filters */ AP_DECLARE_DATA extern ap_filter_rec_t *ap_http_input_filter_handle; -AP_DECLARE_DATA extern ap_filter_rec_t *ap_h1_request_in_filter_handle; +AP_DECLARE_DATA extern ap_filter_rec_t *ap_h1_header_in_filter_handle; AP_DECLARE_DATA extern ap_filter_rec_t *ap_h1_body_in_filter_handle; AP_DECLARE_DATA extern ap_filter_rec_t *ap_http_header_filter_handle; AP_DECLARE_DATA extern ap_filter_rec_t *ap_chunk_filter_handle; @@ -55,9 +55,9 @@ apr_status_t ap_http_filter(ap_filter_t *f, apr_bucket_brigade *b, ap_input_mode_t mode, apr_read_type_e block, apr_off_t readbytes); -apr_status_t ap_h1_request_in_filter(ap_filter_t *f, apr_bucket_brigade *bb, - ap_input_mode_t mode, apr_read_type_e block, - apr_off_t readbytes); +apr_status_t 
ap_h1_header_in_filter(ap_filter_t *f, apr_bucket_brigade *bb, + ap_input_mode_t mode, apr_read_type_e block, + apr_off_t readbytes); apr_status_t ap_h1_body_in_filter(ap_filter_t *f, apr_bucket_brigade *b, ap_input_mode_t mode, apr_read_type_e block, diff --git a/modules/http/http_core.c b/modules/http/http_core.c index 85858ab2b57..7e9f82f87dd 100644 --- a/modules/http/http_core.c +++ b/modules/http/http_core.c @@ -37,7 +37,7 @@ /* Handles for core filters */ AP_DECLARE_DATA ap_filter_rec_t *ap_http_input_filter_handle; -AP_DECLARE_DATA ap_filter_rec_t *ap_h1_request_in_filter_handle; +AP_DECLARE_DATA ap_filter_rec_t *ap_h1_header_in_filter_handle; AP_DECLARE_DATA ap_filter_rec_t *ap_h1_body_in_filter_handle; AP_DECLARE_DATA ap_filter_rec_t *ap_http_header_filter_handle; AP_DECLARE_DATA ap_filter_rec_t *ap_h1_response_out_filter_handle; @@ -50,7 +50,8 @@ AP_DECLARE_DATA const char *ap_multipart_boundary; /* If we are using an MPM That Supports Async Connections, * use a different processing function */ -static int async_mpm = 0; +static int mpm_is_async = 0; +static int mpm_can_waitio = 0; static const char *set_keep_alive_timeout(cmd_parms *cmd, void *dummy, const char *arg) @@ -145,18 +146,34 @@ static int ap_process_http_async_connection(conn_rec *c) AP_DEBUG_ASSERT(cs->state == CONN_STATE_PROCESSING); if (cs->state == CONN_STATE_PROCESSING) { + apr_read_type_e block = APR_BLOCK_READ; + apr_status_t rv; + + /* slave connections (i.e. h2_c2) not ready for WAITIO yet */ + if (mpm_can_waitio && !c->master) { + block = APR_NONBLOCK_READ; + } + ap_update_child_status_from_conn(c->sbh, SERVER_BUSY_READ, c); if (ap_extended_status) { ap_set_conn_count(c->sbh, r, c->keepalives); } - if ((r = ap_read_request(c))) { + + rv = ap_read_request_ex(&r, c, block); + if (APR_STATUS_IS_EAGAIN(rv)) { + cs->state = CONN_STATE_ASYNC_WAITIO; + return OK; + } + if (rv == APR_SUCCESS) { if (r->status == HTTP_OK) { cs->state = CONN_STATE_HANDLER; + if (ap_extended_status) { ap_set_conn_count(c->sbh, r, c->keepalives + 1); } ap_update_child_status(c->sbh, SERVER_BUSY_WRITE, r); ap_process_async_request(r); + /* After the call to ap_process_request, the * request pool may have been deleted. 
We set * r=NULL here to ensure that any dereference @@ -168,7 +185,8 @@ static int ap_process_http_async_connection(conn_rec *c) } if (cs->state != CONN_STATE_WRITE_COMPLETION && - cs->state != CONN_STATE_SUSPENDED) { + cs->state != CONN_STATE_SUSPENDED && + cs->state != CONN_STATE_LINGER) { /* Something went wrong; close the connection */ cs->state = CONN_STATE_LINGER; } @@ -246,7 +264,7 @@ static int ap_process_http_sync_connection(conn_rec *c) static int ap_process_http_connection(conn_rec *c) { - if (async_mpm && !c->clogging_input_filters) { + if (mpm_is_async && !c->clogging_input_filters) { return ap_process_http_async_connection(c); } else { @@ -276,7 +294,7 @@ static void h1_pre_read_request(request_rec *r, conn_rec *c) if (!r->main && !r->prev && !strcmp(AP_PROTOCOL_HTTP1, ap_get_protocol(c))) { if (r->proxyreq == PROXYREQ_NONE) { - ap_add_input_filter_handle(ap_h1_request_in_filter_handle, + ap_add_input_filter_handle(ap_h1_header_in_filter_handle, NULL, r, r->connection); } ap_add_output_filter_handle(ap_h1_response_out_filter_handle, @@ -343,9 +361,14 @@ static int http_send_options(request_rec *r) static int http_post_config(apr_pool_t *p, apr_pool_t *plog, apr_pool_t *ptemp, server_rec *s) { apr_uint64_t val; - if (ap_mpm_query(AP_MPMQ_IS_ASYNC, &async_mpm) != APR_SUCCESS) { - async_mpm = 0; + + if (ap_mpm_query(AP_MPMQ_IS_ASYNC, &mpm_is_async) != APR_SUCCESS) { + mpm_is_async = 0; } + if (ap_mpm_query(AP_MPMQ_CAN_WAITIO, &mpm_can_waitio) != APR_SUCCESS) { + mpm_can_waitio = 0; + } + ap_random_insecure_bytes(&val, sizeof(val)); ap_multipart_boundary = apr_psprintf(p, "%0" APR_UINT64_T_HEX_FMT, val); @@ -369,8 +392,8 @@ static void register_hooks(apr_pool_t *p) ap_http_input_filter_handle = ap_register_input_filter("HTTP_IN", ap_http_filter, NULL, AP_FTYPE_PROTOCOL); - ap_h1_request_in_filter_handle = - ap_register_input_filter("HTTP1_REQUEST_IN", ap_h1_request_in_filter, + ap_h1_header_in_filter_handle = + ap_register_input_filter("HTTP1_HEADER_IN", ap_h1_header_in_filter, NULL, AP_FTYPE_PROTOCOL); ap_h1_body_in_filter_handle = ap_register_input_filter("HTTP1_BODY_IN", ap_h1_body_in_filter, diff --git a/modules/http/http_filters.c b/modules/http/http_filters.c index 426fe2fcb97..d7667c8c361 100644 --- a/modules/http/http_filters.c +++ b/modules/http/http_filters.c @@ -264,9 +264,10 @@ static apr_status_t read_chunked_trailers(http_ctx_t *ctx, ap_filter_t *f, apr_bucket *e; request_rec *r = f->r; apr_table_t *trailers; - apr_table_t *saved_headers_in = r->headers_in; + apr_table_t *saved_headers_in; int saved_status = r->status; + saved_headers_in = r->headers_in; trailers = apr_table_make(r->pool, 5); r->status = HTTP_OK; r->headers_in = trailers; @@ -2174,18 +2175,34 @@ typedef struct h1_request_ctx { const char *method; const char *uri; const char *protocol; + + /* parsing context */ + ap_getline_state_t *getline_state; + apr_bucket_brigade *tmp_bb; + int num_blank_lines; } h1_request_ctx; -static apr_status_t read_request_line(h1_request_ctx *ctx, apr_bucket_brigade *bb) +static apr_status_t read_request_line(h1_request_ctx *ctx, + ap_filter_t *f, apr_read_type_e block, + apr_bucket_brigade *bb) { - apr_size_t len; - int num_blank_lines = DEFAULT_LIMIT_BLANK_LINES; - core_server_config *conf = ap_get_core_module_config(ctx->r->server->module_config); + request_rec *r = ctx->r; + apr_size_t max_size = r->server->limit_req_line + 2 + 1; /* + CRLF + \0 */ + core_server_config *conf = ap_get_core_module_config(r->server->module_config); int strict = (conf->http_conformance != 
AP_HTTP_CONFORMANCE_UNSAFE); + int flags = AP_GETLINE_ALLOC; apr_status_t rv; + if (strict) { + flags |= AP_GETLINE_CRLF; + } + if (block == APR_NONBLOCK_READ) { + flags |= AP_GETLINE_NONBLOCK; + } + /* Read past empty lines until we get a real request line, * a read error, the connection closes (EOF), or we timeout. + * Reentrance on EAGAIN is handled in/by ctx->getline_state. * * We skip empty lines because browsers have to tack a CRLF on to the end * of POSTs to support old CERN webservers. But note that we may not @@ -2199,52 +2216,35 @@ static apr_status_t read_request_line(h1_request_ctx *ctx, apr_bucket_brigade *b * have to block during a read. */ do { - /* ensure ap_rgetline allocates memory each time thru the loop - * if there are empty lines - */ - ctx->request_line = NULL; - len = 0; - rv = ap_rgetline(&ctx->request_line, (apr_size_t)(ctx->r->server->limit_req_line + 2), - &len, ctx->r, strict ? AP_GETLINE_CRLF : 0, bb); + apr_size_t len = 0; + /* allocates memory each time thru the loop */ + rv = ap_fgetline_ex(&ctx->request_line, max_size, &len, f, flags, + bb, &ctx->getline_state, r->pool); if (rv != APR_SUCCESS) { return rv; } - else if (len > 0) { - /* got the line in ctx->r->the_request */ + if (len > 0) { + /* got full line */ return APR_SUCCESS; } - } while (--num_blank_lines >= 0); + } while (--ctx->num_blank_lines >= 0); + /* too many blank lines */ return APR_EINVAL; } -static void sanitize_brigade(apr_bucket_brigade *bb) -{ - apr_bucket *e, *next; - - for (e = APR_BRIGADE_FIRST(bb); - e != APR_BRIGADE_SENTINEL(bb); - e = next) - { - next = APR_BUCKET_NEXT(e); - if (!APR_BUCKET_IS_METADATA(e) && e->length == 0) { - apr_bucket_delete(e); - } - } -} - -apr_status_t ap_h1_request_in_filter(ap_filter_t *f, - apr_bucket_brigade *bb, - ap_input_mode_t mode, - apr_read_type_e block, - apr_off_t readbytes) +apr_status_t ap_h1_header_in_filter(ap_filter_t *f, + apr_bucket_brigade *bb, + ap_input_mode_t mode, + apr_read_type_e block, + apr_off_t readbytes) { request_rec *r = f->r; - apr_bucket *e; h1_request_ctx *ctx = f->ctx; apr_status_t rv = APR_SUCCESS; int http_status = HTTP_OK; + apr_bucket *e; /* just get out of the way for things we don't want to handle. */ if (mode != AP_MODE_READBYTES && mode != AP_MODE_GETLINE) { @@ -2255,15 +2255,23 @@ apr_status_t ap_h1_request_in_filter(ap_filter_t *f, f->ctx = ctx = apr_pcalloc(r->pool, sizeof(*ctx)); ctx->r = r; ctx->state = REQ_LINE; + ctx->num_blank_lines = DEFAULT_LIMIT_BLANK_LINES; + ctx->tmp_bb = apr_brigade_create(r->pool, r->connection->bucket_alloc); } - /* This filter needs to get out of the way of read_request_line() */ - ap_remove_input_filter(f); - - while (APR_SUCCESS == rv) { + for (;;) { switch (ctx->state) { case REQ_LINE: - if ((rv = read_request_line(ctx, bb)) != APR_SUCCESS) { + rv = read_request_line(ctx, f->next, block, ctx->tmp_bb); + apr_brigade_cleanup(ctx->tmp_bb); + + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + ap_log_rerror(APLOG_MARK, APLOG_TRACE6, rv, r, + "reading request line"); + rv = APR_EAGAIN; + goto cleanup; + } + if (rv != APR_SUCCESS) { /* certain failures are answered with a HTTP error bucket * and are terminal for parsing a request */ ctx->method = ctx->uri = "-"; @@ -2280,60 +2288,76 @@ apr_status_t ap_h1_request_in_filter(ap_filter_t *f, else if (APR_STATUS_IS_EINVAL(rv)) { http_status = HTTP_BAD_REQUEST; } + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, rv, r, + "failed reading request line (status %d)", + http_status != HTTP_OK ? 
http_status : -1); goto cleanup; } if (!ap_h1_tokenize_request_line(r, ctx->request_line, - &ctx->method, &ctx->uri, &ctx->protocol)) { + &ctx->method, &ctx->uri, + &ctx->protocol)) { http_status = HTTP_BAD_REQUEST; + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r, + "failed tokenizing request line, " + "returning error bucket %d", + http_status); goto cleanup; } + /* got the request line and it looked to contain what we need */ ctx->state = REQ_HEADERS; break; case REQ_HEADERS: - ap_get_mime_headers_core(r, bb); - if (r->status != HTTP_OK) { - ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(00567) - "request failed: error reading the headers"); - http_status = r->status; + rv = ap_get_mime_headers_ex(r, f->next, block, ctx->tmp_bb, + &ctx->getline_state); + apr_brigade_cleanup(ctx->tmp_bb); + + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + ap_log_rerror(APLOG_MARK, APLOG_TRACE6, rv, r, + "reading request headers"); + goto cleanup; + } + if (rv != APR_SUCCESS || r->status != HTTP_OK) { + http_status = (r->status == HTTP_OK + ? HTTP_INTERNAL_SERVER_ERROR + : r->status); + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, r, APLOGNO(00567) + "request failed: error reading the headers (%i)", + http_status); + r->status = HTTP_OK; goto cleanup; } - /* clear the brigade, as ap_get_mime_headers_core() leaves the last - * empty line in there, insert the REQUEST bucket and return */ - apr_brigade_cleanup(bb); + e = ap_bucket_request_createn(ctx->method, ctx->uri, ctx->protocol, r->headers_in, r->pool, r->connection->bucket_alloc); - /* reading may leave 0 length data buckets in the brigade, - * get rid of those. */ - sanitize_brigade(bb); - APR_BRIGADE_INSERT_HEAD(bb, e); - ctx->state = REQ_BODY; - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, rv, r, + APR_BRIGADE_INSERT_TAIL(bb, e); + + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r, "http1 request and headers parsed: %s %s %s", ctx->method, ctx->uri, ctx->protocol); - goto cleanup; - - case REQ_BODY: - /* we should not come here */ - AP_DEBUG_ASSERT(0); - rv = ap_get_brigade(f->next, bb, mode, block, readbytes); + /* Got the header, done with this filter */ + ap_remove_input_filter(f); + ctx->state = REQ_BODY; goto cleanup; case REQ_ERROR: - default: + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r, + "invalid request read while in error"); rv = APR_EINVAL; goto cleanup; + + default: + /* we should never come here */ + ap_assert(0); + break; } - } /* while(APR_SUCCESS == rv) */ + } cleanup: if (http_status != HTTP_OK) { - ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r, - "failed reading request line, returning error bucket %d", http_status); - apr_brigade_cleanup(bb); e = ap_bucket_error_create(http_status, NULL, r->pool, f->c->bucket_alloc); APR_BRIGADE_INSERT_TAIL(bb, e); diff --git a/modules/http2/h2_stream.c b/modules/http2/h2_stream.c index ee87555f9f3..b050b4d962c 100644 --- a/modules/http2/h2_stream.c +++ b/modules/http2/h2_stream.c @@ -755,7 +755,7 @@ apr_status_t h2_stream_add_header(h2_stream *stream, } if (session->s->limit_req_fields > 0 - && stream->request_headers_added > session->s->limit_req_fields) { + && stream->request_headers_added >= session->s->limit_req_fields) { /* already over limit, count this attempt, but do not take it in */ ++stream->request_headers_added; } diff --git a/modules/proxy/mod_proxy_http.c b/modules/proxy/mod_proxy_http.c index bfeee868558..38da5b0f7f6 100644 --- a/modules/proxy/mod_proxy_http.c +++ b/modules/proxy/mod_proxy_http.c @@ -888,10 +888,8 @@ static apr_status_t 
ap_proxy_read_headers(request_rec *r, request_rec *rr, tmp_bb = apr_brigade_create(r->pool, c->bucket_alloc); while (1) { - rc = ap_proxygetline(tmp_bb, buffer, size, rr, - AP_GETLINE_FOLD | AP_GETLINE_NOSPC_EOL, &len); - - + const int flags = AP_GETLINE_FOLD_COL; + rc = ap_proxygetline(tmp_bb, buffer, size, rr, flags, &len); if (rc != APR_SUCCESS) { if (APR_STATUS_IS_ENOSPC(rc)) { int trunc = (len > 128 ? 128 : len) / 2; diff --git a/server/core.c b/server/core.c index 4d5d569d93b..632af394d8f 100644 --- a/server/core.c +++ b/server/core.c @@ -5551,7 +5551,7 @@ static conn_rec *core_create_conn(apr_pool_t *ptrans, server_rec *s, c->id = id; c->bucket_alloc = alloc; c->async_filter = sconf->async_filter; - + c->keepalive = AP_CONN_UNKNOWN; c->clogging_input_filters = 0; if (sconf->conn_log_level) { diff --git a/server/core_filters.c b/server/core_filters.c index 0887603b9ab..2dbc5afbb83 100644 --- a/server/core_filters.c +++ b/server/core_filters.c @@ -142,13 +142,18 @@ apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b, if (mode == AP_MODE_GETLINE) { /* we are reading a single LF line, e.g. the HTTP headers */ rv = apr_brigade_split_line(b, ctx->bb, block, HUGE_STRING_LEN); - /* We should treat EAGAIN here the same as we do for EOF (brigade is - * empty). We do this by returning whatever we have read. This may - * or may not be bogus, but is consistent (for now) with EOF logic. + + /* To distinguish EAGAIN from EOS (for which apr_brigade_split_line() + * returns an empty brigade), return an empty brigade only for the + * former and APR_EOF for the latter. */ if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { rv = APR_SUCCESS; } + else if (rv == APR_SUCCESS && APR_BRIGADE_EMPTY(b)) { + AP_DEBUG_ASSERT(APR_BRIGADE_EMPTY(ctx->bb)); + rv = APR_EOF; + } goto cleanup; } @@ -234,31 +239,43 @@ apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b, AP_DEBUG_ASSERT(readbytes > 0); - e = APR_BRIGADE_FIRST(ctx->bb); - rv = apr_bucket_read(e, &str, &len, block); - if (rv != APR_SUCCESS) { - if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + do { + e = APR_BRIGADE_FIRST(ctx->bb); + rv = apr_bucket_read(e, &str, &len, block); + if (rv != APR_SUCCESS) { /* getting EAGAIN for a blocking read is an error; not for a - * non-blocking read, return an empty brigade. */ - rv = APR_SUCCESS; + * non-blocking read, return an empty brigade w/ APR_SUCCESS */ + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + rv = APR_SUCCESS; + } + goto cleanup; } - goto cleanup; - } - else if (block == APR_BLOCK_READ && len == 0) { - /* We wanted to read some bytes in blocking mode. We read - * 0 bytes. Hence, we now assume we are EOS. - * - * When we are in normal mode, return an EOS bucket to the - * caller. - * When we are in speculative mode, leave ctx->bb empty, so - * that the next call returns an EOS bucket. - */ - apr_bucket_delete(e); + if (len > 0) { + break; + } + if (APR_BUCKET_IS_METADATA(e)) { + APR_BUCKET_REMOVE(e); + APR_BRIGADE_INSERT_TAIL(b, e); + } + else { + apr_bucket_delete(e); + } + } while (!APR_BRIGADE_EMPTY(ctx->bb)); - if (mode == AP_MODE_READBYTES) { + if (len == 0) { + /* We are at EOS. + * In normal blocking mode, return an EOS bucket. + * Otherwise it's not expected by the caller, so return APR_EOF + * directly. 
+ */ + AP_DEBUG_ASSERT(APR_BRIGADE_EMPTY(ctx->bb)); + if (mode == AP_MODE_READBYTES && block == APR_BLOCK_READ) { e = apr_bucket_eos_create(c->bucket_alloc); APR_BRIGADE_INSERT_TAIL(b, e); } + else if (APR_BRIGADE_EMPTY(b)) { + rv = APR_EOF; + } goto cleanup; } @@ -266,7 +283,7 @@ apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b, if (len < readbytes) { apr_size_t bucket_len; - /* We already registered the data in e in len */ + /* We already accounted for e in len */ e = APR_BUCKET_NEXT(e); while ((len < readbytes) && (rv == APR_SUCCESS) && (e != APR_BRIGADE_SENTINEL(ctx->bb))) { @@ -290,11 +307,11 @@ apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b, } } } - } - /* We can only return at most what we read. */ - if (len < readbytes) { - readbytes = len; + /* We can only return at most what we read. */ + if (len < readbytes) { + readbytes = len; + } } rv = apr_brigade_partition(ctx->bb, readbytes, &e); diff --git a/server/protocol.c b/server/protocol.c index 9ac4e3fe929..e0334722cda 100644 --- a/server/protocol.c +++ b/server/protocol.c @@ -61,6 +61,10 @@ #undef APLOG_MODULE_INDEX #define APLOG_MODULE_INDEX AP_CORE_MODULE_INDEX +#ifndef AP_ASCII_COLON +#define AP_ASCII_COLON '\x3a' +#endif + APR_HOOK_STRUCT( APR_HOOK_LINK(pre_read_request) APR_HOOK_LINK(post_read_request) @@ -210,55 +214,66 @@ AP_DECLARE(apr_time_t) ap_rationalize_mtime(request_rec *r, apr_time_t mtime) * If no LF is detected on the last line due to a dropped connection * or a full buffer, that's considered an error. */ -static apr_status_t ap_fgetline_core(char **s, apr_size_t n, - apr_size_t *read, ap_filter_t *f, - int flags, apr_bucket_brigade *bb, - apr_pool_t *p) +enum folding_state_e { + NOT_FOLDING = 0, + FOLDING_FIND, + FOLDING_READ, + FOLDING_DONE, +}; +struct ap_getline_state { + char *buf; + apr_size_t len; + apr_size_t max_size; + apr_size_t alloc_size; + apr_size_t folding_len; + enum folding_state_e folding_state; + unsigned int folding_col :1, + allocate :1, + reusable :1; +}; +static apr_status_t ap_fgetline_core(ap_getline_state_t *state, + ap_filter_t *f, int flags, + apr_bucket_brigade *bb, + apr_pool_t *p, + int rec) { apr_status_t rv; - apr_bucket *e; - apr_size_t bytes_handled = 0, current_alloc = 0; - char *pos, *last_char = *s; - int do_alloc = (*s == NULL), saw_eos = 0; + apr_read_type_e block; int fold = flags & AP_GETLINE_FOLD; int crlf = flags & AP_GETLINE_CRLF; + int do_alloc = (flags & AP_GETLINE_ALLOC) || state->allocate; int nospc_eol = flags & AP_GETLINE_NOSPC_EOL; - int saw_eol = 0, saw_nospc = 0; - apr_read_type_e block; + apr_status_t late_rv = APR_SUCCESS; + int seen_eol = 0, seen_nospc = 0; + apr_bucket *e; - if (!n) { + state->reusable = 0; /* until further notice */ + + if (state->max_size == 0) { /* Needs room for NUL byte at least */ - *read = 0; return APR_BADARG; } block = (flags & AP_GETLINE_NONBLOCK) ? APR_NONBLOCK_READ : APR_BLOCK_READ; - /* - * Initialize last_char as otherwise a random value will be compared - * against APR_ASCII_LF at the end of the loop if bb only contains - * zero-length buckets. - */ - if (last_char) - *last_char = '\0'; - + if (state->folding_state == FOLDING_FIND) { + /* EAGAIN looking up for folding line, continue there */ + goto find_folding; + } do { apr_brigade_cleanup(bb); rv = ap_get_brigade(f, bb, AP_MODE_GETLINE, block, 0); if (rv != APR_SUCCESS) { goto cleanup; } - - /* Something horribly wrong happened. Someone didn't block! 
-         * (this also happens at the end of each keepalive connection)
-         * (this also happens when non-blocking is asked too, not that wrong)
-         */
         if (APR_BRIGADE_EMPTY(bb)) {
-            if (block != APR_NONBLOCK_READ) {
+            if (block == APR_BLOCK_READ) {
+                /* Something horribly wrong happened.  Someone didn't block! */
                 rv = APR_EGENERAL;
             }
             else {
+                /* A non-blocking read that would block gets us here */
                 rv = APR_EAGAIN;
             }
             goto cleanup;
@@ -271,10 +286,10 @@ static apr_status_t ap_fgetline_core(char **s, apr_size_t n,
             const char *str;
             apr_size_t len;
 
-            /* If we see an EOS, don't bother doing anything more. */
+            /* APR_EOF on EOS (CRLF is missing) */
             if (APR_BUCKET_IS_EOS(e)) {
-                saw_eos = 1;
-                break;
+                rv = APR_EOF;
+                goto cleanup;
             }
 
             rv = apr_bucket_read(e, &str, &len, APR_BLOCK_READ);
@@ -282,6 +297,27 @@ static apr_status_t ap_fgetline_core(char **s, apr_size_t n,
                 goto cleanup;
             }
 
+            /* If folding, trim leading blanks */
+            if (state->folding_state == FOLDING_READ && len > 0) {
+                size_t i;
+                for (i = 0; i < len; ++i) {
+                    const char c = str[i];
+                    if (c != APR_ASCII_BLANK && c != APR_ASCII_TAB) {
+                        break;
+                    }
+                }
+                state->folding_len += i;
+                ap_assert(state->folding_len > 0);
+                str += i;
+                len -= i;
+
+                /* Fail if the line is composed of blanks only */
+                if ((len > 0 && str[0] == APR_ASCII_LF)
+                        || (len > 1 && str[0] == APR_ASCII_CR
+                            && str[1] == APR_ASCII_LF)) {
+                    late_rv = APR_EINVAL;
+                }
+            }
             if (len == 0) {
                 /* no use attempting a zero-byte alloc (hurts when
                  * using --with-efence --enable-pool-debug) or
@@ -290,11 +326,13 @@ static apr_status_t ap_fgetline_core(char **s, apr_size_t n,
                 continue;
             }
 
-            /* Would this overrun our buffer? If so, we'll die. */
-            if (n < bytes_handled + len) {
+            /* Would this exceed the limit? If so, we'll die. */
+            if (state->len + state->folding_len + len >= state->max_size) {
+                apr_size_t read_len = state->len + state->folding_len;
+
                 /* Before we die, let's fill the buffer up to its limit (i.e.
                  * fall through with the remaining length, if any), setting
-                 * saw_eol on LF to stop the outer loop appropriately; we may
+                 * seen_eol on LF to stop the outer loop appropriately; we may
                  * come back here once the buffer is filled (no LF seen), and
                  * either be done at that time or continue to wait for LF here
                  * if nospc_eol is set.
@@ -306,248 +344,299 @@ static apr_status_t ap_fgetline_core(char **s, apr_size_t n,
                  * we have to handle the case so that it's not returned to the
                  * caller as part of the truncated line (it's not!). This is
                  * easier to consider that LF is out of counting and thus fall
-                 * through with no error (saw_eol is set to 2 so that we later
+                 * through with no error (seen_eol is set to 2 so that we later
                  * ignore LF handling already done here), while folding and
                  * nospc_eol logics continue to work (or fail) appropriately.
                  */
-                saw_eol = (str[len - 1] == APR_ASCII_LF);
-                if (/* First time around */
-                    saw_eol && !saw_nospc
-                    /* Single LF completing the buffered CR, */
-                    && ((len == 1 && ((*s)[bytes_handled - 1] == APR_ASCII_CR))
-                        /* or trailing CRLF overuns by LF only */
-                        || (len > 1 && str[len - 2] == APR_ASCII_CR
-                            && n - bytes_handled + 1 == len))) {
-                    /* In both cases *last_char is (to be) the CR stripped by
-                     * later 'bytes_handled = last_char - *s'.
- */ - saw_eol = 2; + seen_eol = (str[len - 1] == APR_ASCII_LF); + if (!seen_eol + || seen_nospc + || read_len + len != state->max_size) { + /* Some data lost */ + late_rv = APR_ENOSPC; + seen_nospc = 1; + } + else if ((len == 1 + && state->len > 0 + && state->buf[state->len - 1] == APR_ASCII_CR) + || (len > 1 && str[len - 2] == APR_ASCII_CR)) { + /* CR[LF] is to be stripped */ + seen_eol = 2; } else { - /* In any other case we'd lose data. */ - rv = APR_ENOSPC; - saw_nospc = 1; + /* Single LF to be stripped (or fail if AP_GETLINE_CRLF) */ + AP_DEBUG_ASSERT(seen_eol == 1); } - len = n - bytes_handled; - if (!len) { - if (saw_eol) { - break; - } - if (nospc_eol) { - continue; - } - goto cleanup; + + if (read_len + 1 >= state->max_size) { + /* Full, check loop condition */ + continue; } + + /* Fall through (fill buf up to len) */ + len = state->max_size - read_len - 1; } /* Do we have to handle the allocation ourselves? */ if (do_alloc) { + apr_size_t more_len = len + (state->folding_state == FOLDING_READ); + /* We'll assume the common case where one bucket is enough. */ - if (!*s) { - current_alloc = len; - *s = apr_palloc(p, current_alloc + 1); + if (state->buf == NULL) { + state->alloc_size = more_len + 1; + state->buf = apr_palloc(p, state->alloc_size); } - else if (bytes_handled + len > current_alloc) { + else if (state->len + more_len >= state->alloc_size) { /* Increase the buffer size */ - apr_size_t new_size = current_alloc * 2; + apr_size_t new_size; char *new_buffer; - if (bytes_handled + len > new_size) { - new_size = (bytes_handled + len) * 2; + if (state->alloc_size >= state->max_size / 2) { + new_size = state->max_size; } + else { + new_size = state->alloc_size * 2; + if (state->len + more_len >= new_size) { + new_size = state->len + more_len + 1; + } + } + ap_assert(new_size > state->len + more_len); - new_buffer = apr_palloc(p, new_size + 1); + new_buffer = apr_palloc(p, new_size); /* Copy what we already had. */ - memcpy(new_buffer, *s, bytes_handled); - current_alloc = new_size; - *s = new_buffer; + memcpy(new_buffer, state->buf, state->len); + state->alloc_size = new_size; + state->buf = new_buffer; } } - /* Just copy the rest of the data to the end of the old buffer. */ - pos = *s + bytes_handled; - memcpy(pos, str, len); - last_char = pos + len - 1; - - /* We've now processed that new data - update accordingly. */ - bytes_handled += len; + if (state->folding_state == FOLDING_READ) { + /* Replace all blanks with a single one. */ + state->buf[state->len++] = APR_ASCII_BLANK; + state->folding_state = FOLDING_DONE; + } + /* Just copy new data to the end of the buffer. */ + memcpy(state->buf + state->len, str, len); + state->len += len; } /* If we got a full line of input, stop reading */ - if (last_char && (*last_char == APR_ASCII_LF)) { - saw_eol = 1; + if (state->len && state->buf[state->len - 1] == APR_ASCII_LF) { + seen_eol = 1; } - } while (!saw_eol); + } while (!seen_eol && (!seen_nospc || nospc_eol)); - if (rv != APR_SUCCESS) { - /* End of line after APR_ENOSPC above */ + if (late_rv != APR_SUCCESS) { + rv = late_rv; + goto cleanup; + } + if (state->folding_state == FOLDING_READ) { + /* Folding is blank only */ + rv = APR_EINVAL; goto cleanup; } /* Now terminate the string at the end of the line; * if the last-but-one character is a CR, terminate there. - * LF is handled above (not accounted) when saw_eol == 2, + * LF is handled above (not accounted) when seen_eol == 2, * the last char is CR to terminate at still. 
*/ - if (saw_eol < 2) { - if (last_char > *s && last_char[-1] == APR_ASCII_CR) { - last_char--; + state->len--; + if (seen_eol != 2) { + if (state->len && state->buf[state->len - 1] == APR_ASCII_CR) { + state->len--; } else if (crlf) { rv = APR_EINVAL; goto cleanup; } } - bytes_handled = last_char - *s; - /* If we're folding, we have more work to do. + /* If we have to search for folding, we have more work to do. + * If folding already, let the (recursive) caller loop for the next + * folding line if any and thus issue terminal recursions only. * - * Note that if an EOS was seen, we know we can't have another line. + * Note that if an empty line or an EOS was seen, we know we can't have + * another line. */ - if (fold && bytes_handled && !saw_eos) { + if (fold && !state->folding_state && state->len) { + state->folding_state = FOLDING_FIND; +find_folding: + flags &= ~AP_GETLINE_FOLD; for (;;) { const char *str; apr_size_t len; - char c; - - /* Clear the temp brigade for this filter read. */ - apr_brigade_cleanup(bb); + char c = 0; /* We only care about the first byte. */ + apr_brigade_cleanup(bb); rv = ap_get_brigade(f, bb, AP_MODE_SPECULATIVE, block, 1); if (rv != APR_SUCCESS) { goto cleanup; } - if (APR_BRIGADE_EMPTY(bb)) { + if (block != APR_NONBLOCK_READ) { + rv = APR_EGENERAL; + } + else { + rv = APR_EAGAIN; + } break; } + do { + e = APR_BRIGADE_FIRST(bb); - e = APR_BRIGADE_FIRST(bb); - - /* If we see an EOS, don't bother doing anything more. */ - if (APR_BUCKET_IS_EOS(e)) { - break; - } - - rv = apr_bucket_read(e, &str, &len, APR_BLOCK_READ); - if (rv != APR_SUCCESS) { - apr_brigade_cleanup(bb); - goto cleanup; - } - - /* Found one, so call ourselves again to get the next line. - * - * FIXME: If the folding line is completely blank, should we - * stop folding? Does that require also looking at the next - * char? - */ - /* When we call destroy, the buckets are deleted, so save that - * one character we need. This simplifies our execution paths - * at the cost of one character read. - */ - c = *str; - if (c == APR_ASCII_BLANK || c == APR_ASCII_TAB) { - /* Do we have enough space? We may be full now. */ - if (bytes_handled >= n) { - rv = APR_ENOSPC; + /* APR_EOF on EOS (CRLF is missing) */ + if (APR_BUCKET_IS_EOS(e)) { + rv = APR_EOF; goto cleanup; } - else { - apr_size_t next_size, next_len; - char *tmp; - /* If we're doing the allocations for them, we have to - * give ourselves a NULL and copy it on return. - */ - if (do_alloc) { - tmp = NULL; - } - else { - tmp = last_char; - } - - next_size = n - bytes_handled; - - rv = ap_fgetline_core(&tmp, next_size, &next_len, f, - flags & ~AP_GETLINE_FOLD, bb, p); - if (rv != APR_SUCCESS) { - goto cleanup; - } - - if (do_alloc && next_len > 0) { - char *new_buffer; - apr_size_t new_size = bytes_handled + next_len + 1; - - /* we need to alloc an extra byte for a null */ - new_buffer = apr_palloc(p, new_size); + rv = apr_bucket_read(e, &str, &len, APR_BLOCK_READ); + if (rv != APR_SUCCESS) { + goto cleanup; + } + if (len > 0) { + c = *str; + break; + } - /* Copy what we already had. 
*/ - memcpy(new_buffer, *s, bytes_handled); + apr_bucket_delete(e); + } while (!APR_BRIGADE_EMPTY(bb)); - /* copy the new line, including the trailing null */ - memcpy(new_buffer + bytes_handled, tmp, next_len); - *s = new_buffer; - } + if (APR_BRIGADE_EMPTY(bb)) { + /* No useful data, continue reading */ + continue; + } + if (c != APR_ASCII_BLANK && c != APR_ASCII_TAB) { + /* Not a continuation line */ + state->folding_state = NOT_FOLDING; + state->folding_col = 0; + break; + } - last_char += next_len; - bytes_handled += next_len; + /* Found one, may be allowed after a colon char only */ + if ((flags & AP_GETLINE_FOLD_COL) && !state->folding_col) { + if (!memchr(state->buf, AP_ASCII_COLON, state->len)) { + rv = APR_EINVAL; + goto cleanup; } + state->folding_col = 1; } - else { /* next character is not tab or space */ - break; + + /* Before folding, trim trailing blanks */ + while (state->len + && (state->buf[state->len - 1] == APR_ASCII_BLANK + || state->buf[state->len - 1] == APR_ASCII_TAB)) { + state->folding_len++; + state->len--; + } + + /* Call ourselves again to get the next line. */ + state->folding_state = FOLDING_READ; + rv = ap_fgetline_core(state, f, flags, bb, p, 1); + if (rv != APR_SUCCESS) { + goto cleanup; } + state->folding_state = FOLDING_FIND; } } cleanup: - if (bytes_handled >= n) { - bytes_handled = n - 1; + if (rec) { + /* On recursion, let the caller do the finalization */ + return rv; } + if (state->buf) { + apr_size_t len; - *read = bytes_handled; - if (*s) { /* ensure the string is NUL terminated */ - (*s)[*read] = '\0'; + state->buf[state->len] = '\0'; /* PR#43039: We shouldn't accept NULL bytes within the line */ - bytes_handled = strlen(*s); - if (bytes_handled < *read) { + len = strlen(state->buf); + if (len < state->len) { ap_log_data(APLOG_MARK, APLOG_DEBUG, ap_server_conf, - "NULL bytes in header", *s, *read, 0); - *read = bytes_handled; + "NULL bytes in header", state->buf, state->len, 0); if (rv == APR_SUCCESS) { rv = APR_EINVAL; } + state->len = len; } } + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + state->reusable = 1; + rv = APR_EAGAIN; + } + apr_brigade_cleanup(bb); return rv; } -AP_DECLARE(apr_status_t) ap_fgetline(char **s, apr_size_t n, - apr_size_t *read, ap_filter_t *f, - int flags, apr_bucket_brigade *bb, - apr_pool_t *p) +AP_DECLARE(apr_status_t) ap_fgetline_ex(char **s, apr_size_t n, + apr_size_t *read, ap_filter_t *f, + int flags, apr_bucket_brigade *bb, + ap_getline_state_t **state_p, + apr_pool_t *p) { apr_status_t rv; - - rv = ap_fgetline_core(s, n, read, f, flags, bb, p); + ap_getline_state_t *state = *state_p; +#if APR_CHARSET_EBCDIC + apr_size_t prev_len = 0; +#endif + if (!state || !state->reusable) { + if (!state) { + *state_p = state = apr_pcalloc(p, sizeof(*state)); + } + else { + memset(state, 0, sizeof(*state)); + } + if (*s && !(flags & AP_GETLINE_ALLOC)) { + state->buf = *s; + } + else { + state->allocate = 1; + *s = NULL; + } + state->max_size = n; + } +#if APR_CHARSET_EBCDIC + else { + prev_len = state->len; + } +#endif + + rv = ap_fgetline_core(state, f, flags, bb, p, 0); + + *s = state->buf; + *read = state->len; #if APR_CHARSET_EBCDIC /* On EBCDIC boxes, each complete http protocol input line needs to be * translated into the code page used by the compiler. Since * ap_fgetline_core uses recursion, we do the translation in a wrapper * function to ensure that each input character gets translated only once. 
*/ - if (*read) { - ap_xlate_proto_from_ascii(*s, *read); + if (*read > prev_len) { + ap_xlate_proto_from_ascii(*s + prev_len, *read - prev_len); } #endif return rv; } +AP_DECLARE(apr_status_t) ap_fgetline(char **s, apr_size_t n, + apr_size_t *read, ap_filter_t *f, + int flags, apr_bucket_brigade *bb, + apr_pool_t *p) +{ + ap_getline_state_t stack_state; + ap_getline_state_t *state = &stack_state; + state->reusable = 0; + + return ap_fgetline_ex(s, n, read, f, flags, bb, &state, p); +} + /* Same as ap_fgetline(), working on r's pool and protocol input filters. * Pulls from r->proto_input_filters instead of r->input_filters for * stricter protocol adherence and better input filter behavior during @@ -557,22 +646,8 @@ AP_DECLARE(apr_status_t) ap_rgetline(char **s, apr_size_t n, apr_size_t *read, request_rec *r, int flags, apr_bucket_brigade *bb) { - apr_status_t rv; - - rv = ap_fgetline_core(s, n, read, r->proto_input_filters, flags, - bb, r->pool); -#if APR_CHARSET_EBCDIC - /* On EBCDIC boxes, each complete http protocol input line needs to be - * translated into the code page used by the compiler. Since - * ap_fgetline_core uses recursion, we do the translation in a wrapper - * function to ensure that each input character gets translated only once. - */ - if (*read) { - ap_xlate_proto_from_ascii(*s, *read); - } -#endif - - return rv; + return ap_fgetline(s, n, read, r->proto_input_filters, + flags, bb, r->pool); } AP_DECLARE(int) ap_getline(char *s, int n, request_rec *r, int flags) @@ -790,30 +865,40 @@ static int table_do_fn_check_lengths(void *r_, const char *key, return 0; } -AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb) +AP_DECLARE(apr_status_t) ap_get_mime_headers_ex(request_rec *r, + ap_filter_t *f, + apr_read_type_e block, + apr_bucket_brigade *bb, + ap_getline_state_t **state_p) { - char *last_field = NULL; - apr_size_t last_len = 0; - apr_size_t alloc_len = 0; - char *field; - char *value; - apr_size_t len; - int fields_read = 0; - char *tmp_field; + apr_status_t rv = APR_SUCCESS; core_server_config *conf = ap_get_core_module_config(r->server->module_config); int strict = (conf->http_conformance != AP_HTTP_CONFORMANCE_UNSAFE); + apr_size_t max_size = r->server->limit_req_fieldsize + 1; + int flags = AP_GETLINE_ALLOC | AP_GETLINE_FOLD_COL; + int fields_read = 0; + + if (strict) { + flags |= AP_GETLINE_CRLF; + } + if (block == APR_NONBLOCK_READ) { + flags |= AP_GETLINE_NONBLOCK; + } /* * Read header lines until we get the empty separator line, a read error, * the connection closes (EOF), reach the server limit, or we timeout. */ while(1) { - apr_status_t rv; - - field = NULL; - rv = ap_rgetline(&field, r->server->limit_req_fieldsize + 2, - &len, r, strict ? AP_GETLINE_CRLF : 0, bb); + char *field = NULL; + apr_size_t len = 0; + /* max_size + 2 for CRLF */ + rv = ap_fgetline_ex(&field, max_size + 2, &len, f, flags, bb, + state_p, r->pool); + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + goto cleanup; + } if (rv != APR_SUCCESS) { if (APR_STATUS_IS_TIMEUP(rv)) { r->status = HTTP_REQUEST_TIME_OUT; @@ -822,7 +907,7 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb r->status = HTTP_BAD_REQUEST; } - /* ap_rgetline returns APR_ENOSPC if it fills up the buffer before + /* ap_fgetline returns APR_ENOSPC if it fills up the buffer before * finding the end-of-line. This is only going to happen if it * exceeds the configured limit for a field size. 
*/ @@ -837,7 +922,12 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb (field) ? field_name_len(field) : 0, (field) ? field : ""); } - return; + goto cleanup; + } + + /* Found the terminating empty end-of-headers line, stop. */ + if (len == 0) { + break; } /* For all header values, and all obs-fold lines, the presence of @@ -849,82 +939,11 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb field[--len] = '\0'; } - if (*field == '\t' || *field == ' ') { - - /* Append any newly-read obs-fold line onto the preceding - * last_field line we are processing - */ - apr_size_t fold_len; - - if (last_field == NULL) { - r->status = HTTP_BAD_REQUEST; - ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03442) - "Line folding encountered before first" - " header line"); - return; - } - - if (field[1] == '\0') { - r->status = HTTP_BAD_REQUEST; - ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03443) - "Empty folded line encountered"); - return; - } - - /* Leading whitespace on an obs-fold line can be - * similarly discarded */ - while (field[1] == '\t' || field[1] == ' ') { - ++field; --len; - } - - /* This line is a continuation of the preceding line(s), - * so append it to the line that we've set aside. - * Note: this uses a power-of-two allocator to avoid - * doing O(n) allocs and using O(n^2) space for - * continuations that span many many lines. - */ - fold_len = last_len + len + 1; /* trailing null */ - - if (fold_len >= (apr_size_t)(r->server->limit_req_fieldsize)) { - r->status = HTTP_BAD_REQUEST; - /* report what we have accumulated so far before the - * overflow (last_field) as the field with the problem - */ - apr_table_setn(r->notes, "error-notes", - "Size of a request header field " - "exceeds server limit."); - ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(00562) - "Request header exceeds LimitRequestFieldSize " - "after folding: %.*s", - field_name_len(last_field), last_field); - return; - } - - if (fold_len > alloc_len) { - char *fold_buf; - alloc_len += alloc_len; - if (fold_len > alloc_len) { - alloc_len = fold_len; - } - fold_buf = (char *)apr_palloc(r->pool, alloc_len); - memcpy(fold_buf, last_field, last_len); - last_field = fold_buf; - } - memcpy(last_field + last_len, field, len +1); /* +1 for nul */ - /* Replace obs-fold w/ SP per RFC 7230 3.2.4 */ - last_field[last_len] = ' '; - last_len += len; - - /* We've appended this obs-fold line to last_len, proceed to - * read the next input line - */ - continue; - } - else if (last_field != NULL) { + { + char *value; - /* Process the previous last_field header line with all obs-folded - * segments already concatenated (this is not operating on the - * most recently read input line). + /* Process the header line with all obs-folded segments already + * concatenated. 
*/ if (r->server->limit_req_fields @@ -936,37 +955,40 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(00563) "Number of request headers exceeds " "LimitRequestFields"); - return; + rv = APR_ENOSPC; + goto cleanup; } - if (!strict) - { + if (!strict) { /* Not Strict ('Unsafe' mode), using the legacy parser */ - if (!(value = strchr(last_field, ':'))) { /* Find ':' or */ + if (!(value = strchr(field, ':'))) { /* Find ':' or */ r->status = HTTP_BAD_REQUEST; /* abort bad request */ ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(00564) "Request header field is missing ':' " "separator: %.*s", (int)LOG_NAME_MAX_LEN, - last_field); - return; + field); + rv = APR_EINVAL; + goto cleanup; } - if (value == last_field) { + if (value == field) { r->status = HTTP_BAD_REQUEST; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03453) "Request header field name was empty"); - return; + rv = APR_EINVAL; + goto cleanup; } *value++ = '\0'; /* NUL-terminate at colon */ - if (strpbrk(last_field, "\t\n\v\f\r ")) { + if (strpbrk(field, "\t\n\v\f\r ")) { r->status = HTTP_BAD_REQUEST; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03452) "Request header field name presented" " invalid whitespace"); - return; + rv = APR_EINVAL; + goto cleanup; } while (*value == ' ' || *value == '\t') { @@ -978,64 +1000,51 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(03451) "Request header field value presented" " bad whitespace"); - return; + rv = APR_EINVAL; + goto cleanup; } } - else /* Using strict RFC7230 parsing */ - { + else { + /* Using strict RFC7230 parsing */ + /* Ensure valid token chars before ':' per RFC 7230 3.2.4 */ - value = (char *)ap_scan_http_token(last_field); - if ((value == last_field) || *value != ':') { + value = (char *)ap_scan_http_token(field); + if ((value == field) || *value != ':') { r->status = HTTP_BAD_REQUEST; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(02426) "Request header field name is malformed: " - "%.*s", (int)LOG_NAME_MAX_LEN, last_field); - return; + "%.*s", (int)LOG_NAME_MAX_LEN, field); + rv = APR_EINVAL; + goto cleanup; } - *value++ = '\0'; /* NUL-terminate last_field name at ':' */ + *value++ = '\0'; /* NUL-terminate field name at ':' */ while (*value == ' ' || *value == '\t') { ++value; /* Skip LWS of value */ } - /* Find invalid, non-HT ctrl char, or the trailing NULL */ - tmp_field = (char *)ap_scan_http_field_content(value); - /* Reject value for all garbage input (CTRLs excluding HT) * e.g. only VCHAR / SP / HT / obs-text are allowed per * RFC7230 3.2.6 - leave all more explicit rule enforcement * for specific header handler logic later in the cycle */ - if (*tmp_field != '\0') { + if (*ap_scan_http_field_content(value) != '\0') { r->status = HTTP_BAD_REQUEST; ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(02427) "Request header value is malformed: " "%.*s", (int)LOG_NAME_MAX_LEN, value); - return; + rv = APR_EINVAL; + goto cleanup; } } - apr_table_addn(r->headers_in, last_field, value); + apr_table_addn(r->headers_in, field, value); - /* This last_field header is now stored in headers_in, + /* This field header is now stored in headers_in, * resume processing of the current input line. */ } - - /* Found the terminating empty end-of-headers line, stop. 
*/ - if (len == 0) { - break; - } - - /* Keep track of this new header line so that we can extend it across - * any obs-fold or parse it on the next loop iteration. We referenced - * our previously allocated buffer in r->headers_in, - * so allocate a fresh buffer if required. - */ - alloc_len = 0; - last_field = field; - last_len = len; } /* Combine multiple message-header fields with the same @@ -1045,14 +1054,25 @@ AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb /* enforce LimitRequestFieldSize for merged headers */ apr_table_do(table_do_fn_check_lengths, r, r->headers_in, NULL); + +cleanup: + apr_brigade_cleanup(bb); + return rv; +} + +AP_DECLARE(void) ap_get_mime_headers_core(request_rec *r, apr_bucket_brigade *bb) +{ + ap_getline_state_t *state = NULL; + (void)ap_get_mime_headers_ex(r, r->proto_input_filters, APR_BLOCK_READ, + bb, &state); } AP_DECLARE(void) ap_get_mime_headers(request_rec *r) { - apr_bucket_brigade *tmp_bb; - tmp_bb = apr_brigade_create(r->pool, r->connection->bucket_alloc); + conn_rec *c = r->connection; + apr_bucket_brigade *tmp_bb = ap_acquire_brigade(c); ap_get_mime_headers_core(r, tmp_bb); - apr_brigade_destroy(tmp_bb); + ap_release_brigade(c, tmp_bb); } AP_DECLARE(request_rec *) ap_create_request(conn_rec *conn) @@ -1305,23 +1325,42 @@ AP_DECLARE(int) ap_assign_request_line(request_rec *r, AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) { + request_rec *r = NULL; + (void)ap_read_request_ex(&r, conn, APR_BLOCK_READ); + return r; +} + +AP_DECLARE(apr_status_t) ap_read_request_ex(request_rec **out_r, conn_rec *conn, + apr_read_type_e block) +{ + apr_status_t rv; int access_status; apr_bucket_brigade *tmp_bb; - apr_bucket *e, *bdata = NULL, *berr = NULL; + apr_bucket *e, *bdata = NULL; + ap_bucket_error *berr = NULL; ap_bucket_request *breq = NULL; const char *method, *uri, *protocol; apr_table_t *headers; - apr_status_t rv; - - request_rec *r = ap_create_request(conn); + request_rec *r; - tmp_bb = apr_brigade_create(r->pool, r->connection->bucket_alloc); - conn->keepalive = AP_CONN_UNKNOWN; + r = conn->partial_request; + if (conn->keepalive == AP_CONN_KEEPALIVE) { + conn->keepalive = AP_CONN_UNKNOWN; + } + if (!r) { + r = ap_create_request(conn); + ap_run_pre_read_request(r, conn); + r->request_time = apr_time_now(); + } - ap_run_pre_read_request(r, conn); + tmp_bb = ap_acquire_brigade(conn); - r->request_time = apr_time_now(); - rv = ap_get_brigade(r->proto_input_filters, tmp_bb, AP_MODE_READBYTES, APR_BLOCK_READ, 0); + rv = ap_get_brigade(r->proto_input_filters, tmp_bb, AP_MODE_READBYTES, block, 0); + if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) { + conn->partial_request = r; + r = NULL; + goto done; + } if (rv != APR_SUCCESS || APR_BRIGADE_EMPTY(tmp_bb)) { /* Not worth dying with. 
*/ conn->keepalive = AP_CONN_CLOSE; @@ -1337,7 +1376,7 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) if (!breq) breq = e->data; } else if (AP_BUCKET_IS_ERROR(e)) { - if (!berr) berr = e; + if (!berr) berr = e->data; } else if (!APR_BUCKET_IS_METADATA(e) && e->length != 0) { if (!bdata) bdata = e; @@ -1345,16 +1384,11 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) } } - if (!breq && !berr) { - ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10389) - "request failed: neither request bucket nor error at start of input"); - access_status = HTTP_INTERNAL_SERVER_ERROR; - goto die_unusable_input; - } - + /* If there is a request, we always process it, as it defines + * the context in which a potential error bucket is handled. */ if (breq) { - /* If there is a request, we always process it, as it defines - * the context in which a potential error bucket is handled. */ + conn->partial_request = NULL; + if (apr_pool_is_ancestor(r->pool, breq->pool)) { method = breq->method; uri = breq->uri; @@ -1369,8 +1403,7 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) } if (!method || !uri || !protocol) { - access_status = berr? ((ap_bucket_error *)(berr->data))->status : - HTTP_INTERNAL_SERVER_ERROR; + access_status = berr ? berr->status : HTTP_INTERNAL_SERVER_ERROR; goto die_unusable_input; } @@ -1414,20 +1447,31 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) goto ignore; } } - if (berr) { - access_status = ((ap_bucket_error *)(berr->data))->status; + /* APLOG_ERR already raised by filters (eventually). */ + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(10467) + "request failed: error %i at start of input", + berr->status); + access_status = berr->status; goto die_unusable_input; } - else if (bdata) { + if (!breq) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10389) + "request failed: neither request bucket nor error " + "at start of input"); + access_status = HTTP_INTERNAL_SERVER_ERROR; + goto die_unusable_input; + } + if (bdata) { /* Since processing of a request body depends on knowing the request, we * cannot handle any data here. For example, chunked-encoding filters are * added after the request is read, so any data buckets here will not * have been de-chunked. */ ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10391) - "request failed: seeing DATA bucket(len=%d) of request " - "body, too early to process", (int)bdata->length); + "request failed: seeing DATA bucket (len=%" APR_SIZE_T_FMT ") " + "of request body, too early to process", + bdata->length); access_status = HTTP_INTERNAL_SERVER_ERROR; goto die_unusable_input; } @@ -1480,7 +1524,9 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) AP_READ_REQUEST_SUCCESS((uintptr_t)r, (char *)r->method, (char *)r->uri, (char *)r->server->defn_name, r->status); - return r; +done: + ap_release_brigade(conn, tmp_bb); + return (*out_r = r) ? APR_SUCCESS : APR_EAGAIN; /* Everything falls through on failure */ @@ -1523,9 +1569,10 @@ AP_DECLARE(request_rec *) ap_read_request(conn_rec *conn) } ignore: - r = NULL; + ap_release_brigade(conn, tmp_bb); + *out_r = conn->partial_request = r = NULL; AP_READ_REQUEST_FAILURE((uintptr_t)r); - return NULL; + return APR_EGENERAL; } AP_DECLARE(int) ap_post_read_request(request_rec *r) From 6eee3f3c338292f4782d71bd07094e90edfe18d7 Mon Sep 17 00:00:00 2001 From: ylavic Date: Thu, 11 Jul 2024 15:24:36 +0200 Subject: [PATCH 21/22] mod_proxy,mpm_event: Replace ap_mpm_register_poll_callback*() by ap_mpm_poll_suspended() to avoid races. 
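
With the old ap_mpm_register_poll_callback_timeout() API the MPM could
fire the IO callback (and thus hand the connection over to another
thread) or the timeout callback while the suspending thread was still
touching the connection, and the IO and timeout callbacks could race
with each other. With the new API the handler simply returns SUSPENDED,
the MPM arms the user pollfds from the suspend_connection hook (that
is, only once it fully owns the connection), and it then reports
readiness or timeout through the single resume_connection hook, so the
connection has exactly one owner at any time.

In outline, a module would typically follow the pattern sketched below
(illustration only: the example_* names and the baton fields are
placeholders, not part of this patch; see mod_proxy_http.c and
mod_proxy_wstunnel.c for the real implementations):

    /* Handler: pump I/O, then hand the connection to the MPM */
    static int example_handler(request_rec *r)
    {
        /* baton allocated from r->pool, holds pool/pfds/timeout */
        example_baton_t *b = example_make_baton(r);
        ap_set_module_config(r->request_config, &example_module, b);
        if (example_pump(b) == SUSPENDED) {
            b->suspended = 1;
            return SUSPENDED; /* don't poll from here, that'd race */
        }
        return example_done(b);
    }

    /* suspend_connection hook: the MPM owns the connection now,
     * so it is safe to (re)arm polling on the user pollfds */
    static void example_suspend_connection(conn_rec *c, request_rec *r)
    {
        example_baton_t *b = r ? ap_get_module_config(r->request_config,
                                                      &example_module)
                               : NULL;
        if (b && b->suspended) {
            ap_mpm_poll_suspended(c, b->pool, b->pfds, b->timeout);
        }
    }

    /* resume_connection hook: polling completed (or timed out) */
    static void example_resume_connection(conn_rec *c, request_rec *r)
    {
        example_baton_t *b = r ? ap_get_module_config(r->request_config,
                                                      &example_module)
                               : NULL;
        if (!b || !b->suspended) {
            return;
        }
        if (c->cs->state == CONN_STATE_SUSPENDED
                && example_pump(b) == SUSPENDED) {
            /* Still pending, keep polling in the MPM */
            ap_mpm_poll_suspended(c, b->pool, b->pfds, b->timeout);
        }
        else {
            /* Done or timed out, release the connection */
            c->cs->state = CONN_STATE_LINGER;
            ap_mpm_resume_suspended(c);
        }
    }

Modules can test for this ability with
ap_mpm_query(AP_MPMQ_CAN_POLL_SUSPENDED, &can), which only the event
MPM implements for now.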
--- include/ap_mmn.h | 8 +- include/ap_mpm.h | 51 +-- include/httpd.h | 2 +- include/mpm_common.h | 18 +- modules/http/http_core.c | 20 +- modules/proxy/mod_proxy_http.c | 266 ++++++++++------ modules/proxy/mod_proxy_wstunnel.c | 220 +++++++------ modules/proxy/proxy_util.c | 2 +- server/mpm/event/event.c | 487 ++++++++++++++++------------- server/mpm_common.c | 39 +-- server/mpm_fdqueue.h | 1 - 11 files changed, 605 insertions(+), 509 deletions(-) diff --git a/include/ap_mmn.h b/include/ap_mmn.h index fb8f4512d47..aac4e1a3401 100644 --- a/include/ap_mmn.h +++ b/include/ap_mmn.h @@ -735,14 +735,18 @@ * ap_check_output_pending() * 20211221.27 (2.5.1-dev) Add min_connection_timeout hook and * ap_get_connection_timeout() + * 20211221.28 (2.5.1-dev) Add ap_mpm_poll_suspended() and + * AP_MPMQ_CAN_POLL_SUSPENDED + * 20240701.0 (2.5.1-dev) Axe ap_mpm_register_poll_callback and + * ap_mpm_register_poll_callback_timeout */ #define MODULE_MAGIC_COOKIE 0x41503235UL /* "AP25" */ #ifndef MODULE_MAGIC_NUMBER_MAJOR -#define MODULE_MAGIC_NUMBER_MAJOR 20211221 +#define MODULE_MAGIC_NUMBER_MAJOR 20240701 #endif -#define MODULE_MAGIC_NUMBER_MINOR 27 /* 0...n */ +#define MODULE_MAGIC_NUMBER_MINOR 0 /* 0...n */ /** * Determine if the server's current MODULE_MAGIC_NUMBER is at least a diff --git a/include/ap_mpm.h b/include/ap_mpm.h index f2fd436d508..9a7ec6eeaa3 100644 --- a/include/ap_mpm.h +++ b/include/ap_mpm.h @@ -184,6 +184,8 @@ AP_DECLARE(apr_status_t) ap_os_create_privileged_process( #define AP_MPMQ_CAN_POLL 18 /** MPM supports CONN_STATE_ASYNC_WAITIO */ #define AP_MPMQ_CAN_WAITIO 19 +/** MPM implements the poll_suspended hook */ +#define AP_MPMQ_CAN_POLL_SUSPENDED 20 /** @} */ /** @@ -206,54 +208,13 @@ typedef void (ap_mpm_callback_fn_t)(void *baton); /* only added support in the Event MPM.... check for APR_ENOTIMPL */ AP_DECLARE(apr_status_t) ap_mpm_resume_suspended(conn_rec *c); /* only added support in the Event MPM.... check for APR_ENOTIMPL */ +AP_DECLARE(apr_status_t) ap_mpm_poll_suspended(conn_rec *c, apr_pool_t *p, + const apr_array_header_t *pfds, + apr_interval_time_t timeout); +/* only added support in the Event MPM.... check for APR_ENOTIMPL */ AP_DECLARE(apr_status_t) ap_mpm_register_timed_callback( apr_time_t t, ap_mpm_callback_fn_t *cbfn, void *baton); -/** - * Register a callback on the readability or writability on a group of - * sockets/pipes. - * @param p Pool used by the MPM for its internal allocations - * @param pfds Array of apr_pollfd_t - * @param cbfn The callback function - * @param baton userdata for the callback function - * @return APR_SUCCESS if all sockets/pipes could be added to a pollset, - * APR_ENOTIMPL if no asynch support, or an apr_pollset_add error. - * @remark When activity is found on any 1 socket/pipe in the list, all are removed - * from the pollset and only 1 callback is issued. - * @remark The passed in pool can be cleared by cbfn and tofn when called back, - * it retains no MPM persistent data and won't be used until the next call - * to ap_mpm_register_poll_callback[_timeout]. - */ - -AP_DECLARE(apr_status_t) ap_mpm_register_poll_callback( - apr_pool_t *p, const apr_array_header_t *pfds, - ap_mpm_callback_fn_t *cbfn, void *baton); - -/** - * Register a callback on the readability or writability on a group of sockets/pipes, - * with a timeout. 
- * @param p Pool used by the MPM for its internal allocations - * @param pfds Array of apr_pollfd_t - * @param cbfn The callback function - * @param tofn The callback function if the timeout expires - * @param baton userdata for the callback function - * @param timeout timeout for I/O in microseconds, unlimited if <= 0 - * @return APR_SUCCESS if all sockets/pipes could be added to a pollset, - * APR_ENOTIMPL if no asynch support, or an apr_pollset_add error. - * @remark When activity is found on any 1 socket/pipe in the list, all are removed - * from the pollset and only 1 callback is issued. - * @remark For each call, only one of tofn or cbfn will be called, never both. - * @remark The passed in pool can be cleared by cbfn and tofn when called back, - * it retains no MPM persistent data and won't be used until the next call - * to ap_mpm_register_poll_callback[_timeout]. - */ - -AP_DECLARE(apr_status_t) ap_mpm_register_poll_callback_timeout( - apr_pool_t *p, const apr_array_header_t *pfds, - ap_mpm_callback_fn_t *cbfn, ap_mpm_callback_fn_t *tofn, - void *baton, apr_time_t timeout); - - typedef enum mpm_child_status { MPM_CHILD_STARTED, MPM_CHILD_EXITED, diff --git a/include/httpd.h b/include/httpd.h index ae08740b227..931f5fff49a 100644 --- a/include/httpd.h +++ b/include/httpd.h @@ -1334,7 +1334,7 @@ typedef enum { CONN_STATE_PROCESSING, /* Processed by process_connection hooks */ CONN_STATE_HANDLER, /* Processed by the modules handlers */ CONN_STATE_WRITE_COMPLETION, /* Flushed by the MPM before entering CONN_STATE_KEEPALIVE */ - CONN_STATE_SUSPENDED, /* Suspended in the MPM until ap_run_resume_suspended() */ + CONN_STATE_SUSPENDED, /* Suspended from the MPM until ap_run_resume_suspended() */ CONN_STATE_LINGER, /* MPM flushes then closes the connection with lingering */ CONN_STATE_LINGER_NORMAL, /* MPM has started lingering close with normal timeout */ CONN_STATE_LINGER_SHORT, /* MPM has started lingering close with short timeout */ diff --git a/include/mpm_common.h b/include/mpm_common.h index 34c61e2a6c2..43320b2b5c9 100644 --- a/include/mpm_common.h +++ b/include/mpm_common.h @@ -422,22 +422,12 @@ AP_DECLARE_HOOK(int, mpm_query, (int query_code, int *result, apr_status_t *rv)) AP_DECLARE_HOOK(apr_status_t, mpm_register_timed_callback, (apr_time_t t, ap_mpm_callback_fn_t *cbfn, void *baton)) -/** - * register the specified callback - * @ingroup hooks - */ -AP_DECLARE_HOOK(apr_status_t, mpm_register_poll_callback, - (apr_pool_t *p, const apr_array_header_t *pds, - ap_mpm_callback_fn_t *cbfn, void *baton)) - -/* register the specified callback, with timeout +/** Put suspended connection's pollfds into the MPM's pollset * @ingroup hooks - * */ -AP_DECLARE_HOOK(apr_status_t, mpm_register_poll_callback_timeout, - (apr_pool_t *p, const apr_array_header_t *pds, - ap_mpm_callback_fn_t *cbfn, ap_mpm_callback_fn_t *tofn, - void *baton, apr_time_t timeout)) +AP_DECLARE_HOOK(apr_status_t, mpm_poll_suspended, + (conn_rec *c, apr_pool_t *p, const apr_array_header_t *pfds, + apr_interval_time_t timeout)) /** Resume the suspended connection * @ingroup hooks diff --git a/modules/http/http_core.c b/modules/http/http_core.c index 7e9f82f87dd..92a472d3fa7 100644 --- a/modules/http/http_core.c +++ b/modules/http/http_core.c @@ -182,20 +182,22 @@ static int ap_process_http_async_connection(conn_rec *c) * of nondeterministic failures later. 
*/ r = NULL; - } - if (cs->state != CONN_STATE_WRITE_COMPLETION && - cs->state != CONN_STATE_SUSPENDED && - cs->state != CONN_STATE_LINGER) { - /* Something went wrong; close the connection */ - cs->state = CONN_STATE_LINGER; + switch (cs->state) { + case CONN_STATE_WRITE_COMPLETION: + case CONN_STATE_SUSPENDED: + case CONN_STATE_LINGER: + return OK; + default: + /* Unexpected, close */ + break; + } } } - else { /* ap_read_request failed - client may have closed */ - cs->state = CONN_STATE_LINGER; - } } + /* Something went wrong; close the connection */ + cs->state = CONN_STATE_LINGER; return OK; } diff --git a/modules/proxy/mod_proxy_http.c b/modules/proxy/mod_proxy_http.c index 38da5b0f7f6..66a66af7949 100644 --- a/modules/proxy/mod_proxy_http.c +++ b/modules/proxy/mod_proxy_http.c @@ -19,9 +19,12 @@ #include "mod_proxy.h" #include "ap_regex.h" #include "ap_mpm.h" +#include "mpm_common.h" module AP_MODULE_DECLARE_DATA proxy_http_module; +static int mpm_can_poll_suspended = 0; + static int (*ap_proxy_clear_connection_fn)(request_rec *r, apr_table_t *headers) = NULL; @@ -275,12 +278,6 @@ static void add_cl(apr_pool_t *p, #define MAX_MEM_SPOOL 16384 -typedef enum { - PROXY_HTTP_REQ_HAVE_HEADER = 0, - - PROXY_HTTP_TUNNELING -} proxy_http_state; - typedef enum { RB_INIT = 0, RB_STREAM_CL, @@ -307,7 +304,6 @@ typedef struct { char *old_cl_val, *old_te_val; apr_off_t cl_val; - proxy_http_state state; rb_methods rb_method; const char *upgrade; @@ -316,108 +312,148 @@ typedef struct { apr_pool_t *async_pool; apr_interval_time_t idle_timeout; - unsigned int can_go_async :1, + unsigned int can_suspend :1, do_100_continue :1, prefetch_nonblocking :1, - force10 :1; + force10 :1, + suspended :1, + upgraded :1; } proxy_http_req_t; -static void proxy_http_async_finish(proxy_http_req_t *req) +static int proxy_http_tunnel_pump(proxy_http_req_t *req) +{ + int status = ap_proxy_tunnel_run(req->tunnel); + if (status == HTTP_GATEWAY_TIME_OUT) { + if (!req->can_suspend) { + /* ap_proxy_tunnel_run() didn't log this */ + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, req->r, APLOGNO() + "proxy: %s tunneling timed out", + req->proto); + } + else { + status = SUSPENDED; + } + } + return status; +} + +/* The backend and SUSPENDED client connections are done, + * release them (the latter in the MPM). + */ +static void proxy_http_async_done(proxy_http_req_t *req, int cancelled) { - conn_rec *c = req->r->connection; + request_rec *r = req->r; + conn_rec *c = r->connection; + proxy_conn_rec *backend = req->backend; + proxy_tunnel_rec *tunnel = req->tunnel; + int reusable = (!cancelled && !req->upgraded); + + ap_log_rerror(APLOG_MARK, APLOG_TRACE5, 0, r, "proxy %s: %s async", + req->proto, cancelled ? 
"cancel" : "finish"); + + if (req->async_pool) { + apr_pool_destroy(req->async_pool); + req->async_pool = NULL; + } - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, req->r, - "proxy %s: finish async", req->proto); + if (!reusable) { + c->keepalive = AP_CONN_CLOSE; + backend->close = 1; + } /* Report bytes exchanged by the backend */ - req->backend->worker->s->read += - ap_proxy_tunnel_conn_bytes_in(req->tunnel->origin); - req->backend->worker->s->transferred += - ap_proxy_tunnel_conn_bytes_out(req->tunnel->origin); + backend->worker->s->read += + ap_proxy_tunnel_conn_bytes_in(tunnel->origin); + backend->worker->s->transferred += + ap_proxy_tunnel_conn_bytes_out(tunnel->origin); - proxy_run_detach_backend(req->r, req->backend); - ap_proxy_release_connection(req->proto, req->backend, req->r->server); + proxy_run_detach_backend(r, backend); + ap_proxy_release_connection(req->proto, backend, r->server); - ap_finalize_request_protocol(req->r); - ap_process_request_after_handler(req->r); - /* don't touch req or req->r from here */ + ap_finalize_request_protocol(r); + ap_process_request_after_handler(r); + /* don't dereference req or r from here! */ - c->cs->state = CONN_STATE_LINGER; + /* Return the client connection to the MPM */ + if (reusable) { + c->cs->state = CONN_STATE_WRITE_COMPLETION; + } + else { + c->cs->state = CONN_STATE_LINGER; + } ap_mpm_resume_suspended(c); } -/* If neither socket becomes readable in the specified timeout, - * this callback will kill the request. - * We do not have to worry about having a cancel and a IO both queued. - */ -static void proxy_http_async_cancel_cb(void *baton) -{ - proxy_http_req_t *req = (proxy_http_req_t *)baton; - - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, req->r, - "proxy %s: cancel async", req->proto); - - req->r->connection->keepalive = AP_CONN_CLOSE; - req->backend->close = 1; - proxy_http_async_finish(req); -} +/* Tell the MPM to poll the connections and resume when ready */ +static void proxy_http_async_poll(proxy_http_req_t *req) +{ + conn_rec *c = req->r->connection; + proxy_tunnel_rec *tunnel = req->tunnel; -/* Invoked by the event loop when data is ready on either end. - * We don't need the invoke_mtx, since we never put multiple callback events - * in the queue. - */ -static void proxy_http_async_cb(void *baton) -{ - proxy_http_req_t *req = (proxy_http_req_t *)baton; - int status; + ap_log_rerror(APLOG_MARK, APLOG_TRACE5, 0, req->r, + "proxy %s: going async", req->proto); + /* Create/clear the subpool used by the MPM to allocate + * the temporary data needed for this polling. 
+     */
     if (req->async_pool) {
-        /* Clear MPM's temporary data */
         apr_pool_clear(req->async_pool);
     }
+    else {
+        apr_pool_create(&req->async_pool, req->p);
+    }
 
-    switch (req->state) {
-    case PROXY_HTTP_TUNNELING:
-        /* Pump both ends until they'd block and then start over again */
-        status = ap_proxy_tunnel_run(req->tunnel);
-        if (status == HTTP_GATEWAY_TIME_OUT) {
-            status = SUSPENDED;
-        }
-        break;
+    ap_mpm_poll_suspended(c, req->async_pool, tunnel->pfds, req->idle_timeout);
+}
 
-    default:
-        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, req->r,
-                      "proxy %s: unexpected async state (%i)",
-                      req->proto, (int)req->state);
-        status = HTTP_INTERNAL_SERVER_ERROR;
-        break;
-    }
+/* The resume_connection hook called by the MPM when async polling completes (or times out) */
+static void proxy_http_resume_connection(conn_rec *c, request_rec *r)
+{
+    proxy_http_req_t *req = NULL;
+    int status;
 
-    if (status == SUSPENDED) {
-        ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, req->r,
-                      "proxy %s: suspended, going async",
-                      req->proto);
-
-        if (!req->async_pool) {
-            /* Create the subpool used by the MPM to alloc its own
-             * temporary data, which we want to clear on the next
-             * round (above) to avoid leaks.
-             */
-            apr_pool_create(&req->async_pool, req->p);
-        }
+    if (r) {
+        req = ap_get_module_config(r->request_config, &proxy_http_module);
+    }
+    if (!req || !req->suspended) {
+        return;
+    }
+    ap_assert(req->r == r);
 
-        ap_mpm_register_poll_callback_timeout(req->async_pool,
-                                              req->tunnel->pfds,
-                                              proxy_http_async_cb,
-                                              proxy_http_async_cancel_cb,
-                                              req, req->idle_timeout);
+    if (c->cs->state == CONN_STATE_SUSPENDED) {
+        status = proxy_http_tunnel_pump(req);
+    }
+    else {
+        AP_DEBUG_ASSERT(c->cs->state == CONN_STATE_LINGER);
+        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO()
+                      "proxy: %s async tunneling timed out (state %i)",
+                      req->proto, c->cs->state);
+        status = DONE;
     }
-    else if (ap_is_HTTP_ERROR(status)) {
-        proxy_http_async_cancel_cb(req);
+    if (status == SUSPENDED) {
+        /* Keep polling in the MPM */
+        proxy_http_async_poll(req);
     }
     else {
-        proxy_http_async_finish(req);
+        /* Done with tunneling */
+        proxy_http_async_done(req, status != OK);
+    }
+}
+
+/* The suspend_connection hook called once the MPM gets the SUSPENDED connection */
+static void proxy_http_suspend_connection(conn_rec *c, request_rec *r)
+{
+    proxy_http_req_t *req = NULL;
+
+    if (r) {
+        req = ap_get_module_config(r->request_config, &proxy_http_module);
+    }
+    if (!req || !req->suspended) {
+        return;
+    }
+    ap_assert(req->r == r);
+
+    proxy_http_async_poll(req);
 }
 
 static int stream_reqbody(proxy_http_req_t *req)
@@ -1553,22 +1589,40 @@ int ap_proxy_http_process_response(proxy_http_req_t *req)
                           "can't create tunnel for %s", upgrade);
             return HTTP_INTERNAL_SERVER_ERROR;
         }
+        if (req->can_suspend) {
+            /* If the MPM allows async polling, this thread will tunnel
+             * all it can now so long as it does not time out on the (short)
+             * async delay, returning to the MPM otherwise to get scheduled
+             * again when the connections are ready.
+             */
+            req->tunnel->timeout = dconf->async_delay;
+        }
+        else {
+            /* If the MPM doesn't allow async polling, the full tunneling
+             * happens now in this thread and timing out is a showstopper.
+             */
+            req->tunnel->timeout = req->idle_timeout;
+        }
 
         r->status = HTTP_SWITCHING_PROTOCOLS;
         req->proto = upgrade;
-
-        if (req->can_go_async) {
-            /* Let the MPM schedule the work when idle */
-            req->state = PROXY_HTTP_TUNNELING;
-            req->tunnel->timeout = dconf->async_delay;
-            proxy_http_async_cb(req);
+        req->upgraded = 1;
+
+        status = proxy_http_tunnel_pump(req);
+        if (status == SUSPENDED) {
+            /* Let the MPM call proxy_http_suspend_connection() when
+             * the connection is returned to it (i.e. not handled anywhere
+             * else anymore). This prevents the connection from being seen
+             * or handled by multiple threads at the same time, which could
+             * happen if we called ap_mpm_poll_suspended() directly from
+             * here: a new IO could cause the connection to be rescheduled
+             * before it actually reaches the MPM.
+             */
+            req->suspended = 1;
             return SUSPENDED;
        }
 
-        /* Let proxy tunnel forward everything within this thread */
-        req->tunnel->timeout = req->idle_timeout;
-        status = ap_proxy_tunnel_run(req->tunnel);
-
         /* Report bytes exchanged by the backend */
         backend->worker->s->read +=
             ap_proxy_tunnel_conn_bytes_in(req->tunnel->origin);
@@ -1932,7 +1986,6 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
     proxy_http_req_t *req = NULL;
     proxy_conn_rec *backend = NULL;
     apr_bucket_brigade *input_brigade = NULL;
-    int mpm_can_poll = 0;
     int is_ssl = 0;
     conn_rec *c = r->connection;
     proxy_dir_conf *dconf;
@@ -1972,7 +2025,6 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
     backend->is_ssl = is_ssl;
 
     dconf = ap_get_module_config(r->per_dir_config, &proxy_module);
-    ap_mpm_query(AP_MPMQ_CAN_POLL, &mpm_can_poll);
 
     req = apr_pcalloc(p, sizeof(*req));
     req->p = p;
@@ -1983,12 +2035,13 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
     req->backend = backend;
     req->proto = scheme;
     req->bucket_alloc = c->bucket_alloc;
-    req->can_go_async = (mpm_can_poll &&
-                         dconf->async_delay_set &&
-                         dconf->async_delay >= 0);
-    req->state = PROXY_HTTP_REQ_HAVE_HEADER;
+    req->can_suspend = (mpm_can_poll_suspended &&
+                        dconf->async_delay_set &&
+                        dconf->async_delay >= 0);
     req->rb_method = RB_INIT;
 
+    ap_set_module_config(r->request_config, &proxy_http_module, req);
+
     if (apr_table_get(r->subprocess_env, "force-proxy-request-1.0")) {
         req->force10 = 1;
     }
@@ -2004,9 +2057,9 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
         }
     }
 
-    if (req->can_go_async || req->upgrade) {
+    if (req->can_suspend || req->upgrade) {
         /* If ProxyAsyncIdleTimeout is not set, use backend timeout */
-        if (req->can_go_async && dconf->async_idle_timeout_set) {
+        if (req->can_suspend && dconf->async_idle_timeout_set) {
             req->idle_timeout = dconf->async_idle_timeout;
         }
         else if (worker->s->timeout_set) {
@@ -2045,7 +2098,7 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
      * data to the backend ASAP?
      */
     if (input_brigade
-        || req->can_go_async
+        || req->can_suspend
         || req->do_100_continue
         || apr_table_get(r->subprocess_env,
                          "proxy-prefetch-nonblocking")) {
@@ -2190,13 +2243,18 @@ static int proxy_http_handler(request_rec *r, proxy_worker *worker,
 static int proxy_http_post_config(apr_pool_t *pconf, apr_pool_t *plog,
                                   apr_pool_t *ptemp, server_rec *s)
 {
-
     /* proxy_http_post_config() will be called twice during startup.  So, don't
      * set up the static data the 1st time through.
*/ if (ap_state_query(AP_SQ_MAIN_STATE) == AP_SQ_MS_CREATE_PRE_CONFIG) { return OK; } +#ifdef AP_MPMQ_CAN_POLL_SUSPENDED + if (ap_mpm_query(AP_MPMQ_CAN_POLL_SUSPENDED, &mpm_can_poll_suspended)) { + mpm_can_poll_suspended = 0; + } +#endif + ap_proxy_clear_connection_fn = APR_RETRIEVE_OPTIONAL_FN(ap_proxy_clear_connection); if (!ap_proxy_clear_connection_fn) { @@ -2214,6 +2272,10 @@ static void ap_proxy_http_register_hook(apr_pool_t *p) proxy_hook_scheme_handler(proxy_http_handler, NULL, NULL, APR_HOOK_FIRST); proxy_hook_canon_handler(proxy_http_canon, NULL, NULL, APR_HOOK_FIRST); warn_rx = ap_pregcomp(p, "[0-9]{3}[ \t]+[^ \t]+[ \t]+\"[^\"]*\"([ \t]+\"([^\"]+)\")?", 0); + + /* For when the tunnel connections are suspended to and resumed from the MPM */ + ap_hook_suspend_connection(proxy_http_suspend_connection, NULL, NULL, APR_HOOK_FIRST); + ap_hook_resume_connection(proxy_http_resume_connection, NULL, NULL, APR_HOOK_FIRST); } AP_DECLARE_MODULE(proxy_http) = { diff --git a/modules/proxy/mod_proxy_wstunnel.c b/modules/proxy/mod_proxy_wstunnel.c index 0e5e6cb8128..3439b08b18d 100644 --- a/modules/proxy/mod_proxy_wstunnel.c +++ b/modules/proxy/mod_proxy_wstunnel.c @@ -17,13 +17,15 @@ #include "mod_proxy.h" #include "http_config.h" #include "ap_mpm.h" +#include "mpm_common.h" module AP_MODULE_DECLARE_DATA proxy_wstunnel_module; +static int mpm_can_poll_suspended = 0; + typedef struct { unsigned int fallback_to_proxy_http :1, fallback_to_proxy_http_set :1; - int mpm_can_poll; apr_time_t idle_timeout; apr_time_t async_delay; } proxyws_dir_conf; @@ -32,83 +34,130 @@ typedef struct ws_baton_t { request_rec *r; proxy_conn_rec *backend; proxy_tunnel_rec *tunnel; + apr_time_t idle_timeout; apr_pool_t *async_pool; const char *scheme; + int suspended; } ws_baton_t; static int can_fallback_to_proxy_http; -static void proxy_wstunnel_callback(void *b); - -static int proxy_wstunnel_pump(ws_baton_t *baton, int async) +static int proxy_wstunnel_pump(ws_baton_t *baton) { int status = ap_proxy_tunnel_run(baton->tunnel); if (status == HTTP_GATEWAY_TIME_OUT) { - if (!async) { + if (!mpm_can_poll_suspended) { /* ap_proxy_tunnel_run() didn't log this */ ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, baton->r, APLOGNO(10225) - "Tunnel timed out"); + "proxy: %s tunneling timed out", + baton->scheme); } else { - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, baton->r, APLOGNO(02542) - "Attempting to go async"); status = SUSPENDED; } } return status; } -static void proxy_wstunnel_finish(ws_baton_t *baton) -{ - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, baton->r, "proxy_wstunnel_finish"); - ap_proxy_release_connection(baton->scheme, baton->backend, baton->r->server); - ap_finalize_request_protocol(baton->r); - ap_lingering_close(baton->r->connection); - ap_mpm_resume_suspended(baton->r->connection); - ap_process_request_after_handler(baton->r); /* don't touch baton or r after here */ +/* The backend and SUSPENDED client connections are done, + * release them (the latter in the MPM). + */ +static void proxy_wstunnel_done(ws_baton_t *baton, int cancelled) +{ + request_rec *r = baton->r; + conn_rec *c = r->connection; + proxy_conn_rec *backend = baton->backend; + + ap_log_rerror(APLOG_MARK, APLOG_TRACE5, 0, r, "proxy %s: %s async", + baton->scheme, cancelled ? 
"cancel" : "finish"); + + /* Upgraded connections not reusable */ + c->keepalive = AP_CONN_CLOSE; + backend->close = 1; + + ap_proxy_release_connection(baton->scheme, backend, r->server); + + ap_finalize_request_protocol(r); + ap_process_request_after_handler(r); + /* don't dereference baton or r from here! */ + + /* Return the client connection to the MPM */ + c->cs->state = CONN_STATE_LINGER; + ap_mpm_resume_suspended(c); } -/* If neither socket becomes readable in the specified timeout, - * this callback will kill the request. We do not have to worry about - * having a cancel and a IO both queued. - */ -static void proxy_wstunnel_cancel_callback(void *b) -{ - ws_baton_t *baton = (ws_baton_t*)b; - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, baton->r, - "proxy_wstunnel_cancel_callback, IO timed out"); - proxy_wstunnel_finish(baton); +/* Tell the MPM to poll the connections and resume when ready */ +static void proxy_wstunnel_poll(ws_baton_t *baton) +{ + request_rec *r = baton->r; + conn_rec *c = r->connection; + + ap_log_rerror(APLOG_MARK, APLOG_TRACE5, 0, r, + "proxy %s: going async", baton->scheme); + + /* Create/clear the subpool used by the MPM to allocate + * the temporary data needed for this polling. + */ + if (baton->async_pool) { + apr_pool_clear(baton->async_pool); + } + else { + apr_pool_create(&baton->async_pool, r->pool); + } + + c->cs->state = CONN_STATE_SUSPENDED; + ap_mpm_poll_suspended(c, baton->async_pool, baton->tunnel->pfds, + baton->idle_timeout); } -/* Invoked by the event loop when data is ready on either end. - * Pump both ends until they'd block and then start over again - * We don't need the invoke_mtx, since we never put multiple callback events - * in the queue. - */ -static void proxy_wstunnel_callback(void *b) -{ - ws_baton_t *baton = (ws_baton_t*)b; +/* The resume_connection hook called by the MPM when polling completes (or times out) */ +static void proxy_wstunnel_resume_connection(conn_rec *c, request_rec *r) +{ + ws_baton_t *baton = NULL; + int status; - /* Clear MPM's temporary data */ - AP_DEBUG_ASSERT(baton->async_pool != NULL); - apr_pool_clear(baton->async_pool); + if (r) { + baton = ap_get_module_config(r->request_config, &proxy_wstunnel_module); + } + if (!baton || !baton->suspended) { + return; + } + ap_assert(baton->r == r); - if (proxy_wstunnel_pump(baton, 1) == SUSPENDED) { - proxyws_dir_conf *dconf = ap_get_module_config(baton->r->per_dir_config, - &proxy_wstunnel_module); + if (c->cs->state == CONN_STATE_SUSPENDED) { + status = proxy_wstunnel_pump(baton); + } + else { + AP_DEBUG_ASSERT(c->cs->state == CONN_STATE_LINGER); + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO() + "proxy: %s async tunneling timed out (state %i)", + baton->scheme, c->cs->state); + status = DONE; + } + if (status == SUSPENDED) { + /* Keep polling in the MPM */ + proxy_wstunnel_poll(baton); + } + else { + /* Done with tunneling */ + proxy_wstunnel_done(baton, status != OK); + } +} - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, baton->r, - "proxy_wstunnel_callback suspend"); +/* The suspend_connection hook called once the MPM gets the SUSPENDED connection */ +static void proxy_wstunnel_suspend_connection(conn_rec *c, request_rec *r) +{ + ws_baton_t *baton = NULL; - ap_mpm_register_poll_callback_timeout(baton->async_pool, - baton->tunnel->pfds, - proxy_wstunnel_callback, - proxy_wstunnel_cancel_callback, - baton, dconf->idle_timeout); + if (r) { + baton = ap_get_module_config(r->request_config, &proxy_wstunnel_module); } - else { - proxy_wstunnel_finish(baton); + if 
(!baton || !baton->suspended) { + return; } + ap_assert(baton->r == r); + + proxy_wstunnel_poll(baton); } static int proxy_wstunnel_check_trans(request_rec *r, const char *url) @@ -296,51 +345,35 @@ static int proxy_wstunnel_request(apr_pool_t *p, request_rec *r, "error creating websocket tunnel"); return HTTP_INTERNAL_SERVER_ERROR; } + if (mpm_can_poll_suspended) { + tunnel->timeout = dconf->async_delay; + } + else { + tunnel->timeout = dconf->idle_timeout; + } baton = apr_pcalloc(r->pool, sizeof(*baton)); baton->r = r; baton->backend = conn; baton->tunnel = tunnel; baton->scheme = scheme; - - if (!dconf->mpm_can_poll) { - tunnel->timeout = dconf->idle_timeout; - status = proxy_wstunnel_pump(baton, 0); - } - else { - tunnel->timeout = dconf->async_delay; - status = proxy_wstunnel_pump(baton, 1); - if (status == SUSPENDED) { - /* Create the subpool used by the MPM to alloc its own - * temporary data, which we want to clear on the next - * round (above) to avoid leaks. - */ - apr_pool_create(&baton->async_pool, baton->r->pool); - - rv = ap_mpm_register_poll_callback_timeout( - baton->async_pool, - baton->tunnel->pfds, - proxy_wstunnel_callback, - proxy_wstunnel_cancel_callback, - baton, - dconf->idle_timeout); - if (rv == APR_SUCCESS) { - return SUSPENDED; - } - - if (APR_STATUS_IS_ENOTIMPL(rv)) { - ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r, APLOGNO(02544) "No async support"); - tunnel->timeout = dconf->idle_timeout; - status = proxy_wstunnel_pump(baton, 0); /* force no async */ - } - else { - ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r, APLOGNO(10211) - "error registering websocket tunnel"); - status = HTTP_INTERNAL_SERVER_ERROR; - } - } + baton->idle_timeout = dconf->idle_timeout; + ap_set_module_config(r->request_config, &proxy_wstunnel_module, baton); + + status = proxy_wstunnel_pump(baton); + if (status == SUSPENDED) { + /* Let the MPM call proxy_wstunnel_suspend_connection() when + * the connection is returned to it (i.e. not handled anywhere + * else anymore). This prevents the connection from being seen + * or handled by multiple threads at the same time, which could + * happen if we'd call ap_mpm_poll_suspended() directly from + * here, during the time for the connection to actually reaches + * the MPM whilst a new IO causes the connection to be + * rescheduled quickly. 
+         */
+        baton->suspended = 1;
+        return SUSPENDED;
     }
-
     if (ap_is_HTTP_ERROR(status)) {
         /* Don't send an error page down an upgraded connection */
         if (!tunnel->replied) {
@@ -462,8 +495,6 @@ static void *create_proxyws_dir_config(apr_pool_t *p, char *dummy)
     new->fallback_to_proxy_http = 1;
     new->idle_timeout = -1; /* no timeout */
 
-    ap_mpm_query(AP_MPMQ_CAN_POLL, &new->mpm_can_poll);
-
     return (void *) new;
 }
 
@@ -477,7 +508,6 @@ static void *merge_proxyws_dir_config(apr_pool_t *p, void *vbase, void *vadd)
                                     : base->fallback_to_proxy_http;
     new->fallback_to_proxy_http_set = (add->fallback_to_proxy_http_set
                                        || base->fallback_to_proxy_http_set);
-    new->mpm_can_poll = add->mpm_can_poll;
     new->idle_timeout = add->idle_timeout;
     new->async_delay = add->async_delay;
 
@@ -514,6 +544,12 @@ static int proxy_wstunnel_post_config(apr_pool_t *pconf, apr_pool_t *plog,
     can_fallback_to_proxy_http
         = (ap_find_linked_module("mod_proxy_http.c") != NULL);
 
+#ifdef AP_MPMQ_CAN_POLL_SUSPENDED
+    if (ap_mpm_query(AP_MPMQ_CAN_POLL_SUSPENDED, &mpm_can_poll_suspended)) {
+        mpm_can_poll_suspended = 0;
+    }
+#endif
+
     return OK;
 }
 
@@ -542,6 +578,10 @@ static void ws_proxy_hooks(apr_pool_t *p)
     proxy_hook_scheme_handler(proxy_wstunnel_handler, NULL, aszSucc, APR_HOOK_FIRST);
     proxy_hook_check_trans(proxy_wstunnel_check_trans, NULL, aszSucc, APR_HOOK_MIDDLE);
     proxy_hook_canon_handler(proxy_wstunnel_canon, NULL, aszSucc, APR_HOOK_FIRST);
+
+    /* For when tunnel connections are suspended into and resumed from the MPM */
+    ap_hook_suspend_connection(proxy_wstunnel_suspend_connection, NULL, NULL, APR_HOOK_FIRST);
+    ap_hook_resume_connection(proxy_wstunnel_resume_connection, NULL, NULL, APR_HOOK_FIRST);
 }
 
 AP_DECLARE_MODULE(proxy_wstunnel) = {
diff --git a/modules/proxy/proxy_util.c b/modules/proxy/proxy_util.c
index 88d174220d8..52595a03ec5 100644
--- a/modules/proxy/proxy_util.c
+++ b/modules/proxy/proxy_util.c
@@ -5898,7 +5898,7 @@ PROXY_DECLARE(int) ap_proxy_tunnel_run(proxy_tunnel_rec *tunnel)
                 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(10221)
                               "proxy: %s: %s flushing failed (%i)",
                               scheme, out->name, rc);
-                status = rc;
+                status = HTTP_BAD_GATEWAY;
                 goto done;
             }
 
diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index e0ba249bbf7..1a71f214c8c 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -246,6 +246,8 @@ typedef struct event_srv_cfg_s event_srv_cfg;
 struct timeout_queue;
 static apr_thread_mutex_t *timeout_mutex;
 
+struct user_poll_baton;
+
 /*
  * The pollset for sockets that are in any of the timeout queues. Currently
  * we use the timeout_mutex to make sure that connections are added/removed
@@ -297,6 +299,8 @@ struct event_conn_state_t {
     struct timeout_queue *q;
     /** the timer event for this entry */
     timer_event_t *te;
+    /** user pollfds (for suspended connection) */
+    struct user_poll_baton *user_baton;
 
     /*
      * when queued to workers
@@ -317,6 +321,8 @@ struct event_conn_state_t {
      * hooks) */
     suspended :1,
+    /** Did the connection time out? */
+    timed_out :1,
     /** Is lingering close from defer_lingering_close()? */
     deferred_linger :1,
     /** Has ap_start_lingering_close() been called? */
@@ -497,6 +503,15 @@ static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *cs)
     apr_time_t elem_expiry;
     apr_time_t next_expiry;
 
+    /* It greatly simplifies the logic to use a single timeout value per q
+     * because the new element can just be added to the end of the list and
+     * it will stay sorted in expiration time sequence.
If brand new + * sockets are sent to the event thread for a readability check, this + * will be a slight behavior change - they use the non-keepalive + * timeout today. With a normal client, the socket will be readable in + * a few milliseconds anyway. + */ + ap_assert(q && !cs->q); cs->q = q; @@ -619,14 +634,14 @@ typedef struct void *baton; } listener_poll_type; -typedef struct socket_callback_baton -{ - ap_mpm_callback_fn_t *cbfunc; - void *user_baton; +struct user_poll_baton { + apr_pool_t *pool; + event_conn_state_t *cs; apr_array_header_t *pfds; + apr_thread_mutex_t *mutex; /* pfds added/removed atomically */ timer_event_t *cancel_event; /* If a timeout was requested, a pointer to the timer event */ - struct socket_callback_baton *next; -} socket_callback_baton_t; + struct user_poll_baton *next; /* chaining */ +}; typedef struct event_child_bucket { ap_pod_t *pod; @@ -1120,6 +1135,9 @@ static int event_query(int query_code, int *result, apr_status_t *rv) case AP_MPMQ_CAN_WAITIO: *result = 1; break; + case AP_MPMQ_CAN_POLL_SUSPENDED: + *result = 1; + break; default: *rv = APR_ENOTIMPL; break; @@ -1223,11 +1241,8 @@ static apr_status_t decrement_connection_count(void *cs_) "connection %" CS_FMT_TO " cleaned up", CS_ARG_TO(cs)); - switch (cs->pub.state) { - case CONN_STATE_SUSPENDED: + if (cs->suspended) { apr_atomic_dec32(&suspended_count); - default: - break; } /* Unblock the listener if it's waiting for connection_count = 0, @@ -1250,15 +1265,24 @@ static apr_status_t decrement_connection_count(void *cs_) static void notify_suspend(event_conn_state_t *cs) { - ap_run_suspend_connection(cs->c, cs->r); - cs->c->sbh = NULL; + AP_DEBUG_ASSERT(!cs->suspended); + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "Suspend connection %" CS_FMT, CS_ARG(cs)); + apr_atomic_inc32(&suspended_count); cs->suspended = 1; + + cs->c->sbh = NULL; + cs->c->suspended_baton = cs; + ap_run_suspend_connection(cs->c, cs->r); } -static void notify_resume(event_conn_state_t *cs, int cleanup) +static void notify_resume(event_conn_state_t *cs) { - cs->suspended = 0; - cs->c->sbh = cleanup ? NULL : cs->sbh; + AP_DEBUG_ASSERT(cs->suspended); + ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c, + "Resume connection %" CS_FMT, CS_ARG(cs)); + + cs->c->sbh = cs->sbh; ap_run_resume_connection(cs->c, cs->r); } @@ -1360,12 +1384,13 @@ static void shutdown_connection(event_conn_state_t *cs, apr_time_t now, * if the connection is currently suspended as far as modules * know, provide notification of resumption. 
 */
-static apr_status_t ptrans_pre_cleanup(void *dummy)
+static apr_status_t ptrans_pre_cleanup(void *arg)
 {
-    event_conn_state_t *cs = dummy;
-
+    event_conn_state_t *cs = arg;
     if (cs->suspended) {
-        notify_resume(cs, 1);
+        cs->sbh = NULL;
+        cs->pub.state = CONN_STATE_LINGER;
+        notify_resume(cs);
     }
     return APR_SUCCESS;
 }
@@ -1440,7 +1465,8 @@ static int pollset_add_at(event_conn_state_t *cs, int sense,
                   (int)cs->pfd.reqevents, CS_ARG(cs),
                   at, line);
 
-    ap_assert(cs->q == NULL && cs->te == NULL && ((q != NULL) ^ (te != NULL)));
+    ap_assert((q != NULL) ^ (te != NULL));
+    ap_assert(cs->q == NULL && cs->te == NULL);
 
     set_conn_state_sense(cs, sense);
 
@@ -1497,8 +1523,6 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
                   (int)cs->pfd.reqevents, CS_ARG(cs),
                   at, line);
 
-    ap_assert((cs->q != NULL) ^ (cs->te != NULL));
-
     if (cs->q) {
         if (!locked) {
             apr_thread_mutex_lock(timeout_mutex);
@@ -1508,7 +1532,7 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
             apr_thread_mutex_unlock(timeout_mutex);
         }
     }
-    else {
+    else if (cs->te) {
         cs->te->canceled = 1;
         cs->te = NULL;
     }
@@ -1537,8 +1561,7 @@ static int pollset_del_at(event_conn_state_t *cs, int locked,
 /* Forward declare */
 static timer_event_t *get_timer_event(apr_time_t timeout,
                                       ap_mpm_callback_fn_t *cbfn, void *baton,
-                                      int insert,
-                                      apr_array_header_t *pfds);
+                                      int insert);
 static void process_lingering_close(event_conn_state_t *cs);
 
 static event_conn_state_t *make_conn_state(apr_pool_t *p, apr_socket_t *csd)
@@ -1640,22 +1663,28 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
             close_connection(cs);
             return;
         }
-
-        cs->pub.sense = CONN_SENSE_DEFAULT;
     }
     else {
         /* The connection is scheduled back */
         c = cs->c;
         c->current_thread = thd;
         c->id = conn_id; /* thread number is part of ID */
         ap_update_sb_handle(cs->sbh, my_child_num, my_thread_num);
-        notify_resume(cs, 0);
+    }
+
+    /* Hooks for suspended connections run here and don't fall through */
+    if (cs->suspended) {
+        ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c,
+                      "resuming connection %" CS_FMT, CS_ARG(cs));
+        notify_resume(cs);
+        return;
     }
 
     ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c,
                   "processing connection %" CS_FMT " (aborted %d, clogging %d)",
                   CS_ARG(cs), c->aborted, c->clogging_input_filters);
 
-    if (cs->pub.state == CONN_STATE_LINGER) {
+    if (cs->pub.state == CONN_STATE_LINGER || c->aborted) {
+        cs->pub.state = CONN_STATE_LINGER;
         goto lingering_close;
     }
 
@@ -1697,16 +1726,15 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
          * worker or prefork MPMs for instance.
          */
         switch (rc) {
-        case DONE:
-            rc = OK; /* same as OK, fall through */
         case OK:
+        case DONE: /* same as OK, fall through */
            if (cs->pub.state == CONN_STATE_PROCESSING) {
                cs->pub.state = CONN_STATE_LINGER;
            }
            else if (cs->pub.state == CONN_STATE_KEEPALIVE) {
                cs->pub.state = CONN_STATE_WRITE_COMPLETION;
            }
-            break;
+            rc = OK;
         }
         if (rc != OK || (cs->pub.state != CONN_STATE_LINGER
                          && cs->pub.state != CONN_STATE_ASYNC_WAITIO
@@ -1735,7 +1763,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
          * event thread poll for read/writeability.
          */
         ap_update_child_status(cs->sbh, SERVER_BUSY_READ, NULL);
-        notify_suspend(cs);
 
        /* If the connection timeout is actually different than the waitio_q's,
         * use a timer event to honor it (e.g.
mod_reqtimeout may enforce its
@@ -1747,7 +1774,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
             if (timeout < TIMERS_FUDGE_TIMEOUT) {
                 timeout = TIMERS_FUDGE_TIMEOUT;
             }
-            te = get_timer_event(timeout, NULL, cs, 1, NULL);
+            te = get_timer_event(timeout, NULL, cs, 1);
         }
         else {
             q = cs->sc->io_q;
@@ -1776,7 +1803,6 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
         }
         if (pending == AGAIN) {
             /* Let the event thread poll for write */
-            notify_suspend(cs);
             cs->pub.sense = CONN_SENSE_DEFAULT;
             if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) {
                 return; /* queued */
@@ -1804,16 +1830,7 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
     if (cs->pub.state == CONN_STATE_KEEPALIVE) {
         ap_update_child_status(cs->sbh, SERVER_BUSY_KEEPALIVE, NULL);
 
-        /* It greatly simplifies the logic to use a single timeout value per q
-         * because the new element can just be added to the end of the list and
-         * it will stay sorted in expiration time sequence. If brand new
-         * sockets are sent to the event thread for a readability check, this
-         * will be a slight behavior change - they use the non-keepalive
-         * timeout today. With a normal client, the socket will be readable in
-         * a few milliseconds anyway.
-         */
-        notify_suspend(cs);
-
+        cs->pub.sense = CONN_SENSE_DEFAULT;
         if (!pollset_add(cs, CONN_SENSE_WANT_READ, cs->ka_sc->ka_q, NULL)) {
             cs->pub.state = CONN_STATE_LINGER;
             goto lingering_close;
@@ -1823,33 +1840,149 @@ static void process_socket(apr_thread_t *thd, apr_pool_t *p,
     }
 
     if (cs->pub.state == CONN_STATE_SUSPENDED) {
-        cs->c->suspended_baton = cs;
-        apr_atomic_inc32(&suspended_count);
         notify_suspend(cs);
-        return; /* done */
+        return; /* suspended */
     }
 
  lingering_close:
     process_lingering_close(cs);
 }
 
+static apr_status_t user_poll_cleanup(void *data)
+{
+    struct user_poll_baton *user_baton = data;
+    apr_array_header_t *pfds = user_baton->pfds;
+    apr_status_t rc, final_rc = APR_SUCCESS;
+    int i;
+
+    /* All the pollfds should be added/removed atomically, so synchronize
+     * with event_poll_suspended().
+ */ + apr_thread_mutex_lock(user_baton->mutex); + for (i = 0; i < pfds->nelts; i++) { + apr_pollfd_t *pfd = (apr_pollfd_t *)pfds->elts + i; + if (pfd->client_data) { + rc = apr_pollset_remove(event_pollset, pfd); + if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) { + final_rc = rc; + } + pfd->client_data = NULL; + } + } + apr_thread_mutex_unlock(user_baton->mutex); + + if (final_rc) { + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + } + return final_rc; +} + +/* Put some user pollfds into the listener pollset for a SUSPENDED connection */ +static apr_status_t event_poll_suspended(conn_rec *c, apr_pool_t *p, + const apr_array_header_t *user_pfds, + apr_interval_time_t timeout) +{ + event_conn_state_t *cs = c->suspended_baton; + apr_status_t rc, final_rc = APR_SUCCESS; + struct user_poll_baton *user_baton; + apr_array_header_t *pfds; + listener_poll_type *pt; + int i; + + AP_DEBUG_ASSERT(cs != NULL); + AP_DEBUG_ASSERT(cs->suspended); + AP_DEBUG_ASSERT(user_pfds->nelts > 0); + if (cs == NULL) { + ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO() + "event_poll_suspended: suspended_baton is NULL"); + return APR_EINVAL; + } + if (!cs->suspended) { + ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO() + "event_poll_suspended: thread isn't suspended"); + return APR_EINVAL; + } + if (user_pfds->nelts <= 0) { + ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO() + "event_poll_suspended: no poll FDs"); + return APR_EINVAL; + } + + cs->pub.state = CONN_STATE_SUSPENDED; + cs->user_baton = user_baton = apr_pcalloc(p, sizeof(*user_baton)); + apr_thread_mutex_create(&user_baton->mutex, APR_THREAD_MUTEX_DEFAULT, p); + user_baton->pfds = pfds = apr_array_copy(p, user_pfds); + user_baton->pool = p; + user_baton->cs = cs; + + apr_pool_pre_cleanup_register(p, user_baton, user_poll_cleanup); + + pt = apr_pcalloc(p, sizeof(*pt)); + pt->baton = user_baton; + pt->type = PT_USER; + + if (timeout >= 0) { + /* Prevent the timer from firing before the pollset is updated */ + if (timeout < TIMERS_FUDGE_TIMEOUT) { + timeout = TIMERS_FUDGE_TIMEOUT; + } + user_baton->cancel_event = get_timer_event(timeout, NULL, cs, 1); + } + cs->te = user_baton->cancel_event; + + /* All the pollfds should be added/removed atomically, so synchronize + * with user_poll_cleanup(). + */ + apr_thread_mutex_lock(user_baton->mutex); + for (i = 0; i < pfds->nelts; i++) { + apr_pollfd_t *pfd = (apr_pollfd_t *)pfds->elts + i; + if (pfd->reqevents) { + if (pfd->reqevents & APR_POLLIN) { + pfd->reqevents |= APR_POLLHUP; + } + pfd->reqevents |= APR_POLLERR; + pfd->client_data = pt; + + rc = apr_pollset_add(event_pollset, pfd); + if (rc != APR_SUCCESS) { + final_rc = rc; + } + } + else { + pfd->client_data = NULL; + } + } + apr_thread_mutex_unlock(user_baton->mutex); + + if (final_rc) { + AP_DEBUG_ASSERT(0); + signal_threads(ST_GRACEFUL); + } + return final_rc; +} + /* Put a SUSPENDED connection back into a queue. 
*/ -static apr_status_t event_resume_suspended (conn_rec *c) +static apr_status_t event_resume_suspended(conn_rec *c) { - event_conn_state_t* cs = (event_conn_state_t*) c->suspended_baton; + event_conn_state_t *cs = c->suspended_baton; + + AP_DEBUG_ASSERT(cs != NULL); + AP_DEBUG_ASSERT(cs->suspended); if (cs == NULL) { ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO(02615) "event_resume_suspended: suspended_baton is NULL"); - return APR_EGENERAL; + return APR_EINVAL; } if (!cs->suspended) { ap_log_cerror (APLOG_MARK, LOG_WARNING, 0, c, APLOGNO(02616) - "event_resume_suspended: Thread isn't suspended"); - return APR_EGENERAL; + "event_resume_suspended: thread isn't suspended"); + return APR_EINVAL; } - apr_atomic_dec32(&suspended_count); - c->suspended_baton = NULL; + cs->c->suspended_baton = NULL; + cs->c->sbh = cs->sbh; + cs->suspended = 0; cs->pub.sense = CONN_SENSE_DEFAULT; if (cs->pub.state != CONN_STATE_LINGER) { @@ -1857,7 +1990,6 @@ static apr_status_t event_resume_suspended (conn_rec *c) if (pollset_add(cs, CONN_SENSE_WANT_WRITE, cs->sc->wc_q, NULL)) { return APR_SUCCESS; /* queued */ } - /* fall through lingering close on error */ cs->pub.state = CONN_STATE_LINGER; } @@ -2150,8 +2282,7 @@ static apr_thread_mutex_t *g_timer_skiplist_mtx; static timer_event_t *get_timer_event(apr_time_t timeout, ap_mpm_callback_fn_t *cbfn, void *baton, - int insert, - apr_array_header_t *pfds) + int insert) { timer_event_t *te; apr_time_t now = (timeout < 0) ? 0 : event_time_now(); @@ -2179,7 +2310,6 @@ static timer_event_t *get_timer_event(apr_time_t timeout, te->baton = baton; te->when = now + timeout; te->timeout = timeout; - te->pfds = pfds; if (insert) { apr_time_t next_expiry; @@ -2219,122 +2349,15 @@ static void put_timer_event(timer_event_t *te, int locked) } } -static apr_status_t event_register_timed_callback_ex(apr_time_t timeout, - ap_mpm_callback_fn_t *cbfn, - void *baton, - apr_array_header_t *pfds) -{ - if (!cbfn) { - return APR_EINVAL; - } - get_timer_event(timeout, cbfn, baton, 1, pfds); - return APR_SUCCESS; -} - static apr_status_t event_register_timed_callback(apr_time_t timeout, ap_mpm_callback_fn_t *cbfn, void *baton) { - event_register_timed_callback_ex(timeout, cbfn, baton, NULL); - return APR_SUCCESS; -} - -static apr_status_t event_cleanup_poll_callback(void *data) -{ - apr_status_t final_rc = APR_SUCCESS; - apr_array_header_t *pfds = data; - int i; - - for (i = 0; i < pfds->nelts; i++) { - apr_pollfd_t *pfd = (apr_pollfd_t *)pfds->elts + i; - if (pfd->client_data) { - apr_status_t rc; - rc = apr_pollset_remove(event_pollset, pfd); - if (rc != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rc)) { - final_rc = rc; - } - pfd->client_data = NULL; - } - } - - if (final_rc) { - AP_DEBUG_ASSERT(0); - signal_threads(ST_GRACEFUL); - } - return final_rc; -} - -static apr_status_t event_register_poll_callback_ex(apr_pool_t *p, - const apr_array_header_t *pfds, - ap_mpm_callback_fn_t *cbfn, - ap_mpm_callback_fn_t *tofn, - void *baton, - apr_time_t timeout) -{ - listener_poll_type *pt; - socket_callback_baton_t *scb; - apr_status_t rc, final_rc = APR_SUCCESS; - int i; - - if (!cbfn || !tofn) { + if (!cbfn) { return APR_EINVAL; } - - scb = apr_pcalloc(p, sizeof(*scb)); - scb->cbfunc = cbfn; - scb->user_baton = baton; - scb->pfds = apr_array_copy(p, pfds); - - pt = apr_palloc(p, sizeof(*pt)); - pt->type = PT_USER; - pt->baton = scb; - - apr_pool_pre_cleanup_register(p, scb->pfds, event_cleanup_poll_callback); - - for (i = 0; i < scb->pfds->nelts; i++) { - apr_pollfd_t *pfd = (apr_pollfd_t 
*)scb->pfds->elts + i;
-        if (pfd->reqevents) {
-            if (pfd->reqevents & APR_POLLIN) {
-                pfd->reqevents |= APR_POLLHUP;
-            }
-            pfd->reqevents |= APR_POLLERR;
-            pfd->client_data = pt;
-        }
-        else {
-            pfd->client_data = NULL;
-        }
-    }
-
-    if (timeout > 0) {
-        /* Prevent the timer from firing before the pollset is updated */
-        if (timeout < TIMERS_FUDGE_TIMEOUT) {
-            timeout = TIMERS_FUDGE_TIMEOUT;
-        }
-        scb->cancel_event = get_timer_event(timeout, tofn, baton, 1, scb->pfds);
-    }
-    for (i = 0; i < scb->pfds->nelts; i++) {
-        apr_pollfd_t *pfd = (apr_pollfd_t *)scb->pfds->elts + i;
-        if (pfd->client_data) {
-            rc = apr_pollset_add(event_pollset, pfd);
-            if (rc != APR_SUCCESS) {
-                final_rc = rc;
-            }
-        }
-    }
-    return final_rc;
-}
-
-static apr_status_t event_register_poll_callback(apr_pool_t *p,
-                                                 const apr_array_header_t *pfds,
-                                                 ap_mpm_callback_fn_t *cbfn,
-                                                 void *baton)
-{
-    return event_register_poll_callback_ex(p,
-                                           pfds,
-                                           cbfn,
-                                           NULL, /* no timeout function */
-                                           baton,
-                                           0 /* no timeout */);
+    get_timer_event(timeout, cbfn, baton, 1);
+    return APR_SUCCESS;
 }
 
 /*
@@ -2363,11 +2386,9 @@ static void process_lingering_close(event_conn_state_t *cs)
     conn_rec *c = cs->c;
     int rc = OK;
 
-    cs->pub.state = CONN_STATE_LINGER;
-
     if (!cs->linger_started) {
         cs->linger_started = 1; /* once! */
-        notify_suspend(cs);
+        cs->pub.state = CONN_STATE_LINGER;
 
         /* Shutdown the connection, i.e. pre_connection_close hooks,
          * SSL/TLS close notify, WC bucket, etc..
@@ -2431,8 +2452,7 @@ static void process_lingering_close(event_conn_state_t *cs)
  * Pre-condition: timeout_mutex must already be locked
  */
 static unsigned int process_timeout_queue_ex(struct timeout_queue *queue,
-                                             apr_time_t now,
-                                             int shrink)
+                                             apr_time_t now, int shrink)
 {
     unsigned int count = 0;
     struct timeout_queue *q;
@@ -2466,6 +2486,7 @@ static unsigned int process_timeout_queue_ex(struct timeout_queue *queue,
                 break;
             }
         }
+        cs->timed_out = 1;
 
         if (cs_in_backlog(cs)) {
             /* Remove the backlog connection from worker_queue (note that
@@ -2473,8 +2494,8 @@ static unsigned int process_timeout_queue_ex(struct timeout_queue *queue,
              * the backlog_q), and unreserve/set a worker/idler since
              * none could handle the event.
              */
-            ap_assert(cs_qe(cs)->cb_baton == cs);
             ap_assert(cs->q == cs->sc->bl_q);
+            ap_assert(cs_qe(cs)->cb_baton == cs);
             ap_queue_info_idlers_inc(worker_queue_info);
             ap_queue_kill_event_locked(worker_queue, cs_qe(cs));
             shutdown_connection(cs, now, 1);
@@ -2588,7 +2609,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
     apr_time_t next_expiry = -1;
     apr_interval_time_t timeout = -1;
     int workers_were_busy = 0, force_stats = 0;
-    socket_callback_baton_t *user_chain;
+    struct user_poll_baton *user_chain;
     const apr_pollfd_t *out_pfd;
     apr_time_t now, poll_time;
     event_conn_state_t *cs;
@@ -2653,24 +2674,54 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
                     continue;
                 }
 
+                /* A timer without a callback is a cancel event for a cs in
+                 * either:
+                 * 1. CONN_STATE_ASYNC_WAITIO: the timer enforces a timeout
+                 *    different from the cs->sc->io_q's;
+                 * 2. CONN_STATE_SUSPENDED: the timer enforces a timeout for
+                 *    some user pollfds bound to the cs.
+                 * In both cases te->baton is the (timed out) cs.
+                 * For 1. we can shut down the connection now, but for 2. we
+                 * need to resume the suspended connection in a worker thread
+                 * so that the responsible module is notified; we do that by
+                 * setting CONN_STATE_LINGER, plus cs->timed_out to make sure
+                 * that this state is maintained/restored after the next/last
+                 * ap_run_resume_connection(), so that the actual close is
+                 * issued.
+                 */
                 if (!te->cbfunc) {
                     cs = te->baton;
+                    AP_DEBUG_ASSERT(cs != NULL);
+                    AP_DEBUG_ASSERT(cs->te == te);
                     put_timer_event(te, 1);
-                    ap_assert(cs && cs->te == te);
-                    ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c,
-                                  "timed out connection %" CS_FMT,
-                                  CS_ARG(cs));
-                    (void)pollset_del(cs, 0);
-                    kill_connection(cs, APR_TIMEUP);
-                    continue;
-                }
+                    cs->te = te = NULL;
+                    cs->timed_out = 1;
+
+                    if (!cs->user_baton) {
+                        ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, cs->c,
+                                      "timed out connection %" CS_FMT,
+                                      CS_ARG(cs));
+                        (void)pollset_del(cs, 0);
+                        shutdown_connection(cs, now, 0);
+                        continue;
+                    }
+
+                    /* Remove all user pollfds from the pollset */
+                    AP_DEBUG_ASSERT(cs->user_baton->pfds != NULL);
+                    apr_pool_cleanup_run(cs->user_baton->pool, cs->user_baton,
+                                         user_poll_cleanup);
+#ifdef AP_DEBUG
+                    memset(cs->user_baton, 0, sizeof(*cs->user_baton));
+#endif
+                    cs->user_baton = NULL;
 
-                if (te->pfds) {
-                    /* remove all sockets from the pollset */
-                    apr_pool_cleanup_run(te->pfds->pool, te->pfds,
-                                         event_cleanup_poll_callback);
+                    AP_DEBUG_ASSERT(cs->suspended);
+                    cs->pub.state = CONN_STATE_LINGER;
                 }
-                push2worker(NULL, te, now, &workers_were_busy);
+                else {
+                    cs = NULL;
+                }
+
+                push2worker(cs, te, now, &workers_were_busy);
             }
             if (te) {
                 next_expiry = te->when;
@@ -2778,7 +2829,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
 
         for (user_chain = NULL; num > 0; --num, ++out_pfd) {
             listener_poll_type *pt = out_pfd->client_data;
-            socket_callback_baton_t *baton;
+            struct user_poll_baton *user_baton;
 
             switch (pt->type) {
             case PT_CSD:
@@ -2894,13 +2945,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
             case PT_USER:
                 /* Multiple pfds of the same baton might trigger in this pass
                  * so chain once here and run the cleanup only after this loop
-                 * to avoid lifetime issues (i.e. pfds->pool cleared while some
-                 * of its pfd->client_data are still to be dereferenced here).
+                 * to avoid lifetime issues (i.e. user_baton->pool cleared while
+                 * some of its pfd->client_data are still to be dereferenced here).
*/ - baton = pt->baton; - if (baton != user_chain && !baton->next) { - baton->next = user_chain; - user_chain = baton; + user_baton = pt->baton; + if (user_baton != user_chain && !user_baton->next) { + user_baton->next = user_chain; + user_chain = user_baton; } break; } @@ -2908,27 +2959,32 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* Time to queue user callbacks chained above */ while (user_chain) { - socket_callback_baton_t *baton = user_chain; - user_chain = user_chain->next; - baton->next = NULL; + struct user_poll_baton *user_baton = user_chain; + user_chain = user_baton->next; + user_baton->next = NULL; + + cs = user_baton->cs; + AP_DEBUG_ASSERT(cs != NULL); + AP_DEBUG_ASSERT(cs->user_baton == user_baton); + AP_DEBUG_ASSERT(cs->te == user_baton->cancel_event); + AP_DEBUG_ASSERT(cs->pub.state == CONN_STATE_SUSPENDED); + AP_DEBUG_ASSERT(cs->suspended); /* Not expirable anymore */ - if (baton->cancel_event) { - baton->cancel_event->canceled = 1; - baton->cancel_event = NULL; + if (cs->te) { + cs->te->canceled = 1; + cs->te = NULL; } - /* remove all sockets from the pollset */ - apr_pool_cleanup_run(baton->pfds->pool, baton->pfds, - event_cleanup_poll_callback); + /* Remove all user pollfds from the pollset */ + apr_pool_cleanup_run(user_baton->pool, user_baton, + user_poll_cleanup); +#ifdef AP_DEBUG + memset(user_baton, 0, sizeof(*user_baton)); +#endif - /* masquerade as a timer event that is firing */ - te = get_timer_event(-1 /* fake timer */, - baton->cbfunc, - baton->user_baton, - 0, /* don't insert it */ - NULL /* no associated socket callback */); - push2worker(NULL, te, now, &workers_were_busy); + /* Schedule ap_run_resume_connection() */ + push2worker(cs, NULL, now, &workers_were_busy); } /* We process the timeout queues here only when the global @@ -2959,6 +3015,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) */ process_timeout_queue(shutdown_q, now); + /* No specific requirement/order for those */ process_timeout_queue(waitio_q, now); process_timeout_queue(write_completion_q, now); process_timeout_queue(keepalive_q, now); @@ -4433,7 +4490,6 @@ static void setup_slave_conn(conn_rec *c, void *csd) cs = make_conn_state(c->pool, csd); cs->c = c; cs->sc = mcs->sc; - cs->suspended = 0; cs->bucket_alloc = c->bucket_alloc; cs->pfd = mcs->pfd; cs->pub = mcs->pub; @@ -5085,14 +5141,11 @@ static void event_hooks(apr_pool_t * p) ap_hook_mpm_query(event_query, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_mpm_register_timed_callback(event_register_timed_callback, NULL, NULL, APR_HOOK_MIDDLE); - ap_hook_mpm_register_poll_callback(event_register_poll_callback, - NULL, NULL, APR_HOOK_MIDDLE); - ap_hook_mpm_register_poll_callback_timeout(event_register_poll_callback_ex, - NULL, NULL, APR_HOOK_MIDDLE); ap_hook_pre_read_request(event_pre_read_request, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_post_read_request(event_post_read_request, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_mpm_get_name(event_get_name, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_mpm_resume_suspended(event_resume_suspended, NULL, NULL, APR_HOOK_MIDDLE); + ap_hook_mpm_poll_suspended(event_poll_suspended, NULL, NULL, APR_HOOK_MIDDLE); ap_hook_pre_connection(event_pre_connection, NULL, NULL, APR_HOOK_REALLY_FIRST); ap_hook_protocol_switch(event_protocol_switch, NULL, NULL, APR_HOOK_REALLY_FIRST); diff --git a/server/mpm_common.c b/server/mpm_common.c index 2973bc9f4f2..d055fa2fd99 100644 --- a/server/mpm_common.c +++ b/server/mpm_common.c @@ -68,10 +68,9 @@ APR_HOOK_LINK(mpm) \ 
APR_HOOK_LINK(mpm_query) \ APR_HOOK_LINK(mpm_register_timed_callback) \ - APR_HOOK_LINK(mpm_register_poll_callback) \ - APR_HOOK_LINK(mpm_register_poll_callback_timeout) \ APR_HOOK_LINK(mpm_get_name) \ APR_HOOK_LINK(mpm_resume_suspended) \ + APR_HOOK_LINK(mpm_poll_suspended) \ APR_HOOK_LINK(end_generation) \ APR_HOOK_LINK(child_status) \ APR_HOOK_LINK(output_pending) \ @@ -111,16 +110,11 @@ AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_register_timed_callback, AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_resume_suspended, (conn_rec *c), (c), APR_ENOTIMPL) -AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_register_poll_callback, - (apr_pool_t *p, const apr_array_header_t *pds, - ap_mpm_callback_fn_t *cbfn, void *baton), - (p, pds, cbfn, baton), APR_ENOTIMPL) -AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_register_poll_callback_timeout, - (apr_pool_t *p, const apr_array_header_t *pds, - ap_mpm_callback_fn_t *cbfn, - ap_mpm_callback_fn_t *tofn, - void *baton, apr_time_t timeout), - (p, pds, cbfn, tofn, baton, timeout), APR_ENOTIMPL) +AP_IMPLEMENT_HOOK_RUN_FIRST(apr_status_t, mpm_poll_suspended, + (conn_rec *c, apr_pool_t *p, + const apr_array_header_t *pfds, + apr_interval_time_t timeout), + (c, p, pfds, timeout), APR_ENOTIMPL) AP_IMPLEMENT_HOOK_RUN_FIRST(int, output_pending, (conn_rec *c), (c), DECLINED) AP_IMPLEMENT_HOOK_RUN_FIRST(int, input_pending, @@ -573,26 +567,17 @@ AP_DECLARE(apr_status_t) ap_mpm_resume_suspended(conn_rec *c) return ap_run_mpm_resume_suspended(c); } -AP_DECLARE(apr_status_t) ap_mpm_register_timed_callback(apr_time_t t, - ap_mpm_callback_fn_t *cbfn, void *baton) +AP_DECLARE(apr_status_t) ap_mpm_poll_suspended(conn_rec *c, apr_pool_t *p, + const apr_array_header_t *pfds, + apr_interval_time_t timeout) { - return ap_run_mpm_register_timed_callback(t, cbfn, baton); + return ap_run_mpm_poll_suspended(c, p, pfds, timeout); } -AP_DECLARE(apr_status_t) ap_mpm_register_poll_callback( - apr_pool_t *p, const apr_array_header_t *pfds, +AP_DECLARE(apr_status_t) ap_mpm_register_timed_callback(apr_time_t t, ap_mpm_callback_fn_t *cbfn, void *baton) { - return ap_run_mpm_register_poll_callback(p, pfds, cbfn, baton); -} - -AP_DECLARE(apr_status_t) ap_mpm_register_poll_callback_timeout( - apr_pool_t *p, const apr_array_header_t *pfds, - ap_mpm_callback_fn_t *cbfn, ap_mpm_callback_fn_t *tofn, - void *baton, apr_time_t timeout) -{ - return ap_run_mpm_register_poll_callback_timeout(p, pfds, cbfn, tofn, - baton, timeout); + return ap_run_mpm_register_timed_callback(t, cbfn, baton); } AP_DECLARE(const char *)ap_show_mpm(void) diff --git a/server/mpm_fdqueue.h b/server/mpm_fdqueue.h index 29297fd60d5..4bb17c82955 100644 --- a/server/mpm_fdqueue.h +++ b/server/mpm_fdqueue.h @@ -89,7 +89,6 @@ struct timer_event_t ap_mpm_callback_fn_t *cbfunc; void *baton; int canceled; - apr_array_header_t *pfds; apr_interval_time_t timeout; }; typedef struct timer_event_t timer_event_t; From 92d0cdd150e7cab265dbd52b01a32954333eaf8a Mon Sep 17 00:00:00 2001 From: ylavic Date: Tue, 27 Jun 2023 01:54:48 +0200 Subject: [PATCH 22/22] mod_status: Be less racy, improve rendering, and show suspended connections. 
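The "less racy" part: each process_score slot is snapshotted with a
seqlock-like retry loop (read the pid/generation pair, copy the whole
record, re-check the pair, and retry if the slot was reused concurrently),
and the snapshot is re-validated before per-process counters are trusted.
A minimal sketch of the idea, for illustration only (this helper is not
code added by the patch; status_handler() below does it inline with
proc_buffer[] and ps_record):

    static void snapshot_process_score(int i, process_score *dst,
                                       pid_t *pid, ap_generation_t *gen)
    {
        volatile process_score *ps = ap_get_scoreboard_process(i);
        do {
            *pid = ps->pid;               /* slot identity before the copy */
            *gen = ps->generation;
            memcpy(dst, (void *)ps, sizeof(*dst));
        } while (dst->generation != *gen  /* slot reused meanwhile? retry */
                 || dst->pid != *pid);
    }

With this, the report never mixes counters from two different children
that occupied the same scoreboard slot, without taking any lock.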
--- modules/generators/mod_status.c | 240 ++++++++++++++++++++------------ modules/lua/lua_request.c | 4 +- 2 files changed, 154 insertions(+), 90 deletions(-) diff --git a/modules/generators/mod_status.c b/modules/generators/mod_status.c index 5ff635cc96e..20187af882b 100644 --- a/modules/generators/mod_status.c +++ b/modules/generators/mod_status.c @@ -71,6 +71,7 @@ #define APR_WANT_STRFUNC #include "apr_want.h" #include "apr_strings.h" +#include "apr_atomic.h" #define STATUS_MAXLINE 64 @@ -199,10 +200,15 @@ static int status_handler(request_rec *r) int short_report; int no_table_report; global_score *global_record; - worker_score *ws_record; + volatile process_score *ps; process_score *ps_record; + worker_score *ws_record; char *stat_buffer; - pid_t *pid_buffer, worker_pid; + pid_t worker_pid; + struct { + pid_t pid; + ap_generation_t gen; + } *proc_buffer; int *thread_idle_buffer = NULL; int *thread_graceful_buffer = NULL; int *thread_busy_buffer = NULL; @@ -249,7 +255,7 @@ static int status_handler(request_rec *r) return HTTP_INTERNAL_SERVER_ERROR; } - pid_buffer = apr_palloc(r->pool, server_limit * sizeof(pid_t)); + proc_buffer = apr_palloc(r->pool, server_limit * sizeof(*proc_buffer)); stat_buffer = apr_palloc(r->pool, server_limit * thread_limit * sizeof(char)); if (is_async) { thread_idle_buffer = apr_palloc(r->pool, server_limit * sizeof(int)); @@ -311,6 +317,7 @@ static int status_handler(request_rec *r) } } + ps_record = apr_palloc(r->pool, sizeof *ps_record); ws_record = apr_palloc(r->pool, sizeof *ws_record); for (i = 0; i < server_limit; ++i) { @@ -319,7 +326,15 @@ static int status_handler(request_rec *r) clock_t tmp_tu, tmp_ts, tmp_tcu, tmp_tcs; #endif - ps_record = ap_get_scoreboard_process(i); + /* Snapshot all in one go */ + ps = ap_get_scoreboard_process(i); + do { + proc_buffer[i].pid = ps->pid; + proc_buffer[i].gen = ps->generation; + memcpy(ps_record, (void *)ps, sizeof(*ps_record)); + } while (ps_record->generation != proc_buffer[i].gen + || ps_record->pid != proc_buffer[i].pid); + if (is_async) { thread_idle_buffer[i] = 0; thread_graceful_buffer[i] = 0; @@ -328,7 +343,12 @@ static int status_handler(request_rec *r) for (j = 0; j < thread_limit; ++j) { int indx = (i * thread_limit) + j; - ap_copy_scoreboard_worker(ws_record, i, j); + if (ps_record->pid) { + ap_copy_scoreboard_worker(ws_record, i, j); + } + else { + memset(ws_record, 0, sizeof(*ws_record)); + } res = ws_record->status; if ((i >= max_servers || j >= threads_per_child) @@ -337,8 +357,8 @@ static int status_handler(request_rec *r) else stat_buffer[indx] = status_flags[res]; - if (!ps_record->quiescing - && ps_record->pid) { + if (ps_record->pid + && !ps_record->quiescing) { if (res == SERVER_READY) { if (ps_record->generation == mpm_generation) idle++; @@ -410,7 +430,6 @@ static int status_handler(request_rec *r) tcu += proc_tcu; tcs += proc_tcs; #endif - pid_buffer[i] = ps_record->pid; } /* up_time in seconds */ @@ -426,14 +445,15 @@ static int status_handler(request_rec *r) "

<html><head>\n<title>Apache Server Status for ", r);
     ap_rvputs(r, ap_escape_html(r->pool, ap_get_server_name(r)),
               " (via ", r->connection->local_ip,
-              ")</title></head><body>\n\n", NULL);
-    ap_rvputs(r, "<dl><dt>Server Version: ",
+              ")\n", NULL);
+    ap_rvputs(r, "</title></head><body>\n<dl><dt>Server Version: ",
                  ap_get_server_description(), "</dt>\n", NULL);
-    ap_rvputs(r, "<dt>Server MPM: ",
-                 ap_show_mpm(), "</dt>\n", NULL);
     ap_rvputs(r, "<dt>Server Built: ",
-                 ap_get_server_built(), "\n</dt></dl><hr /><dl>\n", NULL);
-    ap_rvputs(r, "<dt>Current Time: ",
+                 ap_get_server_built(), "</dt>\n", NULL);
+    ap_rvputs(r, "<dt>Server MPM: ",
+                 ap_show_mpm(), "</dt>\n</dl>\n"
+                 "<hr/>\n", NULL);
+    ap_rvputs(r, "<dl>\n<dt>Current Time: ",
                  ap_ht_time(r->pool, nowtime, DEFAULT_TIME_FORMAT, 0),
                  "</dt>\n", NULL);
     ap_rvputs(r, "<dt>Restart Time: ",
@@ -561,97 +581,131 @@ static int status_handler(request_rec *r)
         ap_rprintf(r, "BusyWorkers: %d\nGracefulWorkers: %d\nIdleWorkers: %d\n",
                    busy, graceful, idle);
 
     if (!short_report)
-        ap_rputs("</dl>", r);
+        ap_rputs("</dl>\n", r);
 
     if (is_async) {
-        int wait_io = 0, write_completion = 0, shutdown = 0, lingering_close = 0,
-            keep_alive = 0, connections = 0, stopping = 0, procs = 0;
+        apr_uint32_t procs = 0, stopping = 0, accepting = 0,
+                     connections = 0, backlog = 0, wait_io = 0, writing = 0,
+                     keep_alive = 0, shutdown = 0, suspended = 0, closing = 0;
         if (!short_report)
-            ap_rputs("\n\n<table rules=\"all\" cellpadding=\"1%\">\n"
-                     "<tr><th rowspan=\"2\">Slot</th>"
-                     "<th rowspan=\"2\">PID</th>"
-                     "<th rowspan=\"2\">Stopping</th>"
-                     "<th colspan=\"2\">Connections</th>\n"
-                     "<th colspan=\"3\">Threads</th>"
-                     "<th colspan=\"5\">Async connections</th></tr>\n"
-                     "<tr><th>total</th><th>accepting</th>"
-                     "<th>busy</th><th>graceful</th><th>idle</th>"
-                     "<th>wait-io</th><th>writing</th><th>keep-alive</th>"
-                     "<th>shutdown</th><th>closing</th></tr>\n", r);
+            ap_rputs("<table rules=\"all\" cellpadding=\"1%\">\n"
+                     "<tr><th colspan=\"4\">Processes</th>"
+                     "<th colspan=\"3\">Threads</th>\n"
+                     "<th colspan=\"2\">Connections</th>"
+                     "<th colspan=\"6\">Async queues</th></tr>\n"
+                     "<tr><th>Slot</th><th>PID</th>"
+                     "<th>stopping</th><th>accepting</th>"
+                     "<th>idle</th><th>busy</th><th>graceful</th>"
+                     "<th>total</th><th>backlog</th>"
+                     "<th>wait-io</th><th>writing</th><th>keep-alive</th>"
+                     "<th>shutdown</th><th>suspended</th><th>closing</th></tr>\n",
+                     r);
 
         for (i = 0; i < server_limit; ++i) {
-            ps_record = ap_get_scoreboard_process(i);
-            if (ps_record->pid) {
+            ps = ap_get_scoreboard_process(i);
+            if (!proc_buffer[i].pid
+                || ps->pid != proc_buffer[i].pid
+                || ps->generation != proc_buffer[i].gen) {
+                continue;
+            }
+
+            /* Still the same as what we accounted for earlier? */
+            memcpy(ps_record, (void *)ps, sizeof(*ps_record));
+            if (ps_record->pid == proc_buffer[i].pid
+                && ps_record->generation == proc_buffer[i].gen) {
                 connections += ps_record->connections;
+                backlog += ps_record->backlog;
                 wait_io += ps_record->wait_io;
-                write_completion += ps_record->write_completion;
+                writing += ps_record->write_completion;
                 keep_alive += ps_record->keep_alive;
                 shutdown += ps_record->shutdown;
-                lingering_close += ps_record->lingering_close;
+                suspended += ps_record->suspended;
+                closing += ps_record->lingering_close;
                 procs++;
                 if (ps_record->quiescing) {
                     stopping++;
                 }
+                if (!ps_record->not_accepting) {
+                    accepting++;
+                }
                 if (!short_report) {
                     const char *dying = "no";
                     const char *old = "";
+                    const char *listening = "yes";
                     if (ps_record->quiescing) {
                         dying = "yes";
                     }
-                    if (ps_record->generation != mpm_generation)
+                    if (ps_record->generation != mpm_generation) {
                         old = " (old gen)";
+                    }
+                    if (ps_record->not_accepting) {
+                        listening = "no";
+                    }
                     ap_rprintf(r, "<tr>"
-                               "<td>%u</td><td>%" APR_PID_T_FMT "</td>"
-                               "<td>%s%s</td><td>%u</td><td>%s</td>"
-                               "<td>%u</td><td>%u</td><td>%u</td>"
-                               "<td>%u</td><td>%u</td><td>%u</td>"
-                               "<td>%u</td><td>%u</td></tr>\n",
+                               "<td>%u</td><td>%" APR_PID_T_FMT "</td>"
+                               "<td>%s%s</td><td>%s</td>"
+                               "<td>%d</td><td>%d</td><td>%d</td>"
+                               "<td>%u</td><td>%u</td>"
+                               "<td>%u</td><td>%u</td><td>%u</td>"
+                               "<td>%u</td><td>%u</td><td>%u</td></tr>\n",
                                i, ps_record->pid,
-                               dying, old,
-                               ps_record->connections,
-                               ps_record->not_accepting ? "no" : "yes",
+                               dying, old, listening,
+                               thread_idle_buffer[i],
                                thread_busy_buffer[i],
                                thread_graceful_buffer[i],
-                               thread_idle_buffer[i],
+                               ps_record->connections,
+                               ps_record->backlog,
                                ps_record->wait_io,
                                ps_record->write_completion,
                                ps_record->keep_alive,
                                ps_record->shutdown,
+                               ps_record->suspended,
                                ps_record->lingering_close);
                 }
             }
         }
         if (!short_report) {
             ap_rprintf(r, "<tr><th>Sum</th>"
-                       "<td>%d</td><td>%d</td>"
-                       "<td>%d</td><td>&nbsp;</td>"
-                       "<td>%d</td><td>%d</td><td>%d</td>"
-                       "<td>%d</td><td>%d</td><td>%d</td>"
-                       "<td>%d</td><td>%d</td></tr>\n"
-                       "</table>\n",
-                       procs, stopping,
-                       connections,
-                       busy, graceful, idle,
-                       wait_io, write_completion, keep_alive,
-                       shutdown, lingering_close);
+                       "<td>%u</td><td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td>"
+                       "<td>%u</td><td>%u</td></tr>\n"
+                       "</table>\n",
+                       procs, stopping, accepting,
+                       idle, busy, graceful,
+                       connections, backlog,
+                       wait_io, writing,
+                       keep_alive, shutdown,
+                       suspended, closing);
         }
         else {
-            ap_rprintf(r, "Processes: %d\n"
-                       "Stopping: %d\n"
-                       "ConnsTotal: %d\n"
-                       "ConnsAsyncWaitIO: %d\n"
-                       "ConnsAsyncWriting: %d\n"
-                       "ConnsAsyncKeepAlive: %d\n"
-                       "ConnsAsyncShutdown: %d\n"
-                       "ConnsAsyncClosing: %d\n",
-                       procs, stopping,
-                       connections,
-                       wait_io, write_completion, keep_alive,
-                       shutdown, lingering_close);
+            ap_rprintf(r, "Processes: %u\n"
+                       "Stopping: %u\n"
+                       "Accepting: %u\n"
+                       "ThreadsIdle: %u\n"
+                       "ThreadsBusy: %u\n"
+                       "ThreadsGraceful: %u\n"
+                       "ConnsTotal: %u\n"
+                       "ConnsBacklog: %u\n"
+                       "ConnsAsyncWaitIO: %u\n"
+                       "ConnsAsyncWriting: %u\n"
+                       "ConnsAsyncKeepAlive: %u\n"
+                       "ConnsAsyncShutdown: %u\n"
+                       "ConnsAsyncSuspended: %u\n"
+                       "ConnsAsyncClosing: %u\n",
+                       procs, stopping, accepting,
+                       idle, busy, graceful,
+                       connections, backlog,
+                       wait_io, writing,
+                       keep_alive, shutdown,
+                       suspended, closing);
         }
     }
 
     /* send the scoreboard 'table' out */
     if (!short_report)
-        ap_rputs("<pre>", r);
+        ap_rputs("<pre>\n", r);
     else
         ap_rputs("Scoreboard: ", r);
 
@@ -673,11 +727,11 @@ static int status_handler(request_rec *r)
     if (short_report)
         ap_rputs("\n", r);
     else {
-        ap_rputs("</pre>\n"
+        ap_rputs("\n</pre>\n"
                  "<p>Scoreboard Key:<br />\n"
                  "\"<b><code>_</code></b>\" Waiting for Connection, \n"
                  "\"<b><code>S</code></b>\" Starting up, \n"
-                 "\"<b><code>R</code></b>\" Reading Request,<br />\n"
+                 "\"<b><code>R</code></b>\" Waiting I/O,<br />\n"
                  "\"<b><code>W</code></b>\" Sending Reply, \n"
                  "\"<b><code>K</code></b>\" Keepalive (read), \n"
                  "\"<b><code>D</code></b>\" DNS Lookup,<br />\n"
@@ -690,17 +744,21 @@ static int status_handler(request_rec *r)
     if (!ap_extended_status) {
         int j;
         int k = 0;
-        ap_rputs("PID Key:<br />\n"
+        ap_rputs("<hr/><p>PID Key:<br />\n"
                  "<pre>\n", r);
             for (i = 0; i < server_limit; ++i) {
+                ps = ap_get_scoreboard_process(i);
+                if (!proc_buffer[i].pid
+                    || ps->pid != proc_buffer[i].pid
+                    || ps->generation != proc_buffer[i].gen) {
+                    continue;
+                }
                 for (j = 0; j < thread_limit; ++j) {
                     int indx = (i * thread_limit) + j;
 
-                    if (stat_buffer[indx] != '.') {
-                        ap_rprintf(r, "   %" APR_PID_T_FMT
-                                   " in state: %c ", pid_buffer[i],
-                                   stat_buffer[indx]);
-
+                    if (stat_buffer[indx] != status_flags[SERVER_DISABLED]) {
+                        ap_rprintf(r, "  %8" APR_PID_T_FMT " in state: %c ",
+                                   proc_buffer[i].pid, stat_buffer[indx]);
                         if (++k >= 3) {
                             ap_rputs("\n", r);
                             k = 0;
@@ -709,17 +767,16 @@ static int status_handler(request_rec *r)
                     }
                 }
             }
-
-            ap_rputs("\n"
-                     "</pre>\n", r);
+            ap_rvputs(r, k ? "\n" : "", "\n", "</pre><hr/>\n", NULL);
         }
     }
 
     if (ap_extended_status && !short_report) {
         if (no_table_report)
-            ap_rputs("<dl><dt>Server Details</dt></dl>\n\n", r);
+            ap_rputs("<hr/>\n<dl><dt>Server Details</dt></dl>\n", r);
         else
-            ap_rputs("\n\n"
+            ap_rputs("<hr/>\n"
+                     "<table border=\"0\">"
                      "<tr><th>Srv</th><th>PID</th><th>Acc</th>"
                      "<th>M</th>"
 #ifdef HAVE_TIMES
                      "<th>CPU</th>"
 #endif
                      "<th>SS</th><th>Req</th><th>Dur</th>"
                      "<th>Conn</th><th>Child</th><th>Slot</th>"
                      "<th>Client</th><th>Protocol</th><th>VHost</th>"
-                     "<th>Request</th></tr>\n\n", r);
+                     "<th>Request</th></tr>\n", r);
 
         for (i = 0; i < server_limit; ++i) {
+            ps = ap_get_scoreboard_process(i);
+            if (!proc_buffer[i].pid
+                || ps->pid != proc_buffer[i].pid
+                || ps->generation != proc_buffer[i].gen) {
+                continue;
+            }
+
             for (j = 0; j < thread_limit; ++j) {
                 ap_copy_scoreboard_worker(ws_record, i, j);
 
@@ -740,8 +804,6 @@ static int status_handler(request_rec *r)
                     continue;
                 }
 
-                ps_record = ap_get_scoreboard_process(i);
-
                 if (ws_record->start_time == 0L)
                     req_time = 0L;
                 else
@@ -763,8 +825,8 @@ static int status_handler(request_rec *r)
                     worker_generation = ws_record->generation;
                 }
                 else {
-                    worker_pid = ps_record->pid;
-                    worker_generation = ps_record->generation;
+                    worker_pid = proc_buffer[i].pid;
+                    worker_generation = proc_buffer[i].gen;
                 }
 
                 if (no_table_report) {
@@ -842,7 +904,7 @@ static int status_handler(request_rec *r)
                         format_byte_out(r, bytes);
                     ap_rputs(")\n", r);
                     ap_rprintf(r,
-                               " %s {%s}(%s)[%s]<br />\n\n",
+                               " %s {%s}(%s)[%s]<br />\n",
                                ap_escape_html(r->pool,
                                               ws_record->client64),
                                ap_escape_html(r->pool,
@@ -929,7 +991,7 @@ static int status_handler(request_rec *r)
                                (float)bytes / MBYTE);
                 ap_rprintf(r,
                            "<td>%s</td><td>%s</td><td>%s</td><td>%s</td>"
-                           "<td>%s</td></tr>\n\n",
+                           "<td>%s</td></tr>\n",
                            ap_escape_html(r->pool,
                                           ws_record->client64),
                            ap_escape_html(r->pool,
@@ -945,7 +1007,7 @@ static int status_handler(request_rec *r)
 
         if (!no_table_report) {
             ap_rputs("</table>\n \
-<hr /> \
+<hr/> \
 <table>\n \
 <tr><th>Srv</th><td>Child Server number - generation</td></tr>\n \
 <tr><th>PID</th><td>OS process ID</td></tr>\n \
@@ -962,13 +1024,15 @@ static int status_handler(request_rec *r)
 <tr><th>Conn</th><td>Kilobytes transferred this connection</td></tr>\n \
 <tr><th>Child</th><td>Megabytes transferred this child</td></tr>\n \
-<tr><th>Slot</th><td>Total megabytes transferred this slot</td></tr>\n \
-</table>\n", r);
+<tr><th>Slot</th><td>Total megabytes transferred this slot</td></tr>\n \
+</table>\n \
+<hr/>\n", r);
         }
     }                   /* if (ap_extended_status && !short_report) */
     else {
 
         if (!short_report) {
-            ap_rputs("<hr/>To obtain a full report with current status "
+            ap_rputs("<hr/>\n"
+                     "To obtain a full report with current status "
                      "information you need to use the "
                      "<code>ExtendedStatus On</code> directive.\n", r);
         }
@@ -986,7 +1050,7 @@ static int status_handler(request_rec *r)
 
     if (!short_report) {
         ap_rputs(ap_psignature("<hr />\n",r), r);
-        ap_rputs("</body></html>\n", r);
+        ap_rputs("</body>\n</html>\n", r);
     }
 
     return 0;
diff --git a/modules/lua/lua_request.c b/modules/lua/lua_request.c
index f93c3493af4..51cf63f565a 100644
--- a/modules/lua/lua_request.c
+++ b/modules/lua/lua_request.c
@@ -1268,11 +1268,11 @@ static int lua_ap_scoreboard_process(lua_State *L)
         lua_pushnumber(L, ps_record->suspended);
         lua_settable(L, -3);
 
-        lua_pushstring(L, "wait_io");
+        lua_pushstring(L, "wait-io");
         lua_pushnumber(L, ps_record->wait_io);
         lua_settable(L, -3);
 
-        lua_pushstring(L, "write_completion");
+        lua_pushstring(L, "writing");
         lua_pushnumber(L, ps_record->write_completion);
         lua_settable(L, -3);
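
Taken together with the earlier patches in the series, the module-side
pattern for the new suspend/poll/resume API boils down to the sketch
below. This is illustrative only: the my_* names are hypothetical
(mod_proxy_wstunnel above is the real consumer), while the hooks, the
CONN_STATE_* values and the MPM calls are the ones introduced here:

    typedef struct {
        apr_pool_t *pool;          /* subpool for the MPM's polling data */
        apr_array_header_t *pfds;  /* pollfds to wait on */
        apr_interval_time_t timeout;
        int suspended;
    } my_baton_t;

    module AP_MODULE_DECLARE_DATA my_module;

    /* suspend_connection hook: runs once the MPM fully owns the
     * SUSPENDED connection, so starting to poll cannot race another
     * thread still handling the connection.
     */
    static void my_suspend_connection(conn_rec *c, request_rec *r)
    {
        my_baton_t *baton = r ? ap_get_module_config(r->request_config,
                                                     &my_module) : NULL;
        if (!baton || !baton->suspended) {
            return;
        }
        c->cs->state = CONN_STATE_SUSPENDED;
        ap_mpm_poll_suspended(c, baton->pool, baton->pfds, baton->timeout);
    }

    /* resume_connection hook: runs in a worker thread when a pollfd
     * triggered (state still CONN_STATE_SUSPENDED) or when the poll
     * timed out (state forced to CONN_STATE_LINGER by the MPM).
     */
    static void my_resume_connection(conn_rec *c, request_rec *r)
    {
        my_baton_t *baton = r ? ap_get_module_config(r->request_config,
                                                     &my_module) : NULL;
        if (!baton || !baton->suspended) {
            return;
        }
        if (c->cs->state == CONN_STATE_SUSPENDED) {
            /* I/O is ready: pump data here, then either go back to
             * polling (as in my_suspend_connection) or fall through
             * to finish the tunnel.
             */
        }
        baton->suspended = 0;
        c->cs->state = CONN_STATE_LINGER;
        ap_mpm_resume_suspended(c);
    }

Both hooks are registered the way mod_proxy_wstunnel does above, with
ap_hook_suspend_connection() and ap_hook_resume_connection().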