thirdparty: update picohttpparser (#20843)

2025-09-12 17:07:11 -04:00 · 2024-02-16 05:36:22 -04:00 · 2024-02-16 05:36:22 -04:00 · 09c35acb09
commit 09c35acb09
parent a21658b9fb
2 changed files with 74 additions and 31 deletions
--- a/thirdparty/picohttpparser/src/picohttpparser.c
+++ b/thirdparty/picohttpparser/src/picohttpparser.c
@ -241,6 +241,41 @@ static const char *is_complete(const char *buf, const char *buf_end, size_t last
        *valp_ += res_;                                                                                                            \
    } while (0)

+/* Parses a token that starts at `buf` and is terminated by `next_char`.
+ * On success `*token` is set to the start of the token, `*token_len` to its length (which may be zero; the callers check for
+ * that), and the returned pointer refers to the terminating `next_char` itself. If a byte is encountered that is neither
+ * `next_char` nor accepted by `token_char_map`, `*ret` is set to -1 and NULL is returned. End-of-input handling is delegated to
+ * CHECK_EOF, which is defined outside this hunk (presumably it reports an incomplete input through `*ret` and returns NULL;
+ * TODO confirm against the macro definition).
+ * The returned pointer is always within [buf, buf_end), or null. */
+static const char *parse_token(const char *buf, const char *buf_end, const char **token, size_t *token_len, char next_char,
+                               int *ret)
+{
+    /* We use pcmpestri to detect non-token characters. This instruction can take no more than eight character ranges (8*2*8=128
+     * bits that is the size of a SSE register). Due to this restriction, characters `|` and `~` are handled in the slow loop. */
+    static const char ALIGNED(16) ranges[] = "\x00 "  /* control chars and up to SP */
+                                             "\"\""   /* 0x22 */
+                                             "()"     /* 0x28,0x29 */
+                                             ",,"     /* 0x2c */
+                                             "//"     /* 0x2f */
+                                             ":@"     /* 0x3a-0x40 */
+                                             "[]"     /* 0x5b-0x5d */
+                                             "{\xff"; /* 0x7b-0xff */
+    const char *buf_start = buf; /* remember where the token begins; `buf` itself is advanced below */
+    int found;
+    buf = findchar_fast(buf, buf_end, ranges, sizeof(ranges) - 1, &found); /* sizeof - 1 drops the literal's trailing NUL */
+    if (!found) { /* the fast scan matched none of the ranges */
+        CHECK_EOF();
+    }
+    while (1) { /* slow path: byte-by-byte validation, which also lets `|` and `~` through (see the note on `ranges`) */
+        if (*buf == next_char) { /* tested first: the delimiters passed by the callers (':' and ' ') are not token characters */
+            break;
+        } else if (!token_char_map[(unsigned char)*buf]) { /* cast keeps the table index non-negative when char is signed */
+            *ret = -1;
+            return NULL;
+        }
+        ++buf;
+        CHECK_EOF();
+    }
+    *token = buf_start;
+    *token_len = buf - buf_start; /* the delimiter is not counted */
+    return buf;
+}
+
 /* returned pointer is always within [buf, buf_end), or null */
 static const char *parse_http_version(const char *buf, const char *buf_end, int *minor_version, int *ret)
 {
@ -280,31 +315,10 @@ static const char *parse_headers(const char *buf, const char *buf_end, struct ph
        if (!(*num_headers != 0 && (*buf == ' ' || *buf == '\t'))) {
            /* parsing name, but do not discard SP before colon, see
             * http://www.mozilla.org/security/announce/2006/mfsa2006-33.html */
-            headers[*num_headers].name = buf;
-            static const char ALIGNED(16) ranges1[] = "\x00 "  /* control chars and up to SP */
-                                                      "\"\""   /* 0x22 */
-                                                      "()"     /* 0x28,0x29 */
-                                                      ",,"     /* 0x2c */
-                                                      "//"     /* 0x2f */
-                                                      ":@"     /* 0x3a-0x40 */
-                                                      "[]"     /* 0x5b-0x5d */
-                                                      "{\377"; /* 0x7b-0xff */
-            int found;
-            buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
-            if (!found) {
-                CHECK_EOF();
-            }
-            while (1) {
-                if (*buf == ':') {
-                    break;
-                } else if (!token_char_map[(unsigned char)*buf]) {
-                    *ret = -1;
+            if ((buf = parse_token(buf, buf_end, &headers[*num_headers].name, &headers[*num_headers].name_len, ':', ret)) == NULL) {
                return NULL;
            }
-                ++buf;
-                CHECK_EOF();
-            }
-            if ((headers[*num_headers].name_len = buf - headers[*num_headers].name) == 0) {
+            if (headers[*num_headers].name_len == 0) {
                *ret = -1;
                return NULL;
            }
@ -352,13 +366,17 @@ static const char *parse_request(const char *buf, const char *buf_end, const cha
    }

    /* parse request line */
-    ADVANCE_TOKEN(*method, *method_len);
+    if ((buf = parse_token(buf, buf_end, method, method_len, ' ', ret)) == NULL) {
+        return NULL;
+    }
    do {
        ++buf;
+        CHECK_EOF();
    } while (*buf == ' ');
    ADVANCE_TOKEN(*path, *path_len);
    do {
        ++buf;
+        CHECK_EOF();
    } while (*buf == ' ');
    if (*method_len == 0 || *path_len == 0) {
        *ret = -1;
@ -422,6 +440,7 @@ static const char *parse_response(const char *buf, const char *buf_end, int *min
    }
    do {
        ++buf;
+        CHECK_EOF();
    } while (*buf == ' ');
    /* parse status code, we want at least [:digit:][:digit:][:digit:]<other char> to try to parse */
    if (buf_end - buf < 4) {
@ -430,14 +449,15 @@ static const char *parse_response(const char *buf, const char *buf_end, int *min
    }
    PARSE_INT_3(status);

-    /* get message includig preceding space */
+    /* get message including preceding space */
    if ((buf = get_token_to_eol(buf, buf_end, msg, msg_len, ret)) == NULL) {
        return NULL;
    }
    if (*msg_len == 0) {
        /* ok */
    } else if (**msg == ' ') {
-        /* remove preceding space */
+        /* Remove preceding space. Successful return from `get_token_to_eol` guarantees that we would hit something other than SP
+         * before running past the end of the given buffer. */
        do {
            ++*msg;
            --*msg_len;
@ -525,6 +545,8 @@ ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_
    size_t dst = 0, src = 0, bufsz = *_bufsz;
    ssize_t ret = -2; /* incomplete */

+    decoder->_total_read += bufsz;
+
    while (1) {
        switch (decoder->_state) {
        case CHUNKED_IN_CHUNK_SIZE:
@ -537,6 +559,18 @@ ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_
                        ret = -1;
                        goto Exit;
                    }
+                    /* the only characters that may appear after the chunk size are BWS, semicolon, or CRLF */
+                    switch (buf[src]) {
+                    case ' ':
+                    case '\011':
+                    case ';':
+                    case '\012':
+                    case '\015':
+                        break;
+                    default:
+                        ret = -1;
+                        goto Exit;
+                    }
                    break;
                }
                if (decoder->_hex_count == sizeof(size_t) * 2) {
@ -632,6 +666,12 @@ Exit:
    if (dst != src)
        memmove(buf + dst, buf + src, bufsz - src);
    *_bufsz = dst;
+    /* if incomplete but the overhead of the chunked encoding is >=100KB and >80%, signal an error */
+    if (ret == -2) {
+        decoder->_total_overhead += bufsz - dst;
+        if (decoder->_total_overhead >= 100 * 1024 && decoder->_total_read - decoder->_total_overhead < decoder->_total_read / 4)
+            ret = -1;
+    }
    return ret;
 }

--- a/thirdparty/picohttpparser/src/picohttpparser.h
+++ b/thirdparty/picohttpparser/src/picohttpparser.h
@ -27,6 +27,7 @@
 #ifndef picohttpparser_h
 #define picohttpparser_h

+#include <stdint.h>
 #include <sys/types.h>

 #ifdef _MSC_VER
@ -39,12 +40,12 @@ extern "C" {

 /* contains name and value of a header (name == NULL if this is a continuing
 * line of a multiline header) */
-typedef struct phr_header {
+struct phr_header {
    const char *name;
    size_t name_len;
    const char *value;
    size_t value_len;
-}phr_header;
+};

 /* returns number of bytes consumed if successful, -2 if request is partial,
 * -1 if failed */
@ -64,6 +65,8 @@ struct phr_chunked_decoder {
    char consume_trailer;       /* if trailing headers should be consumed */
    char _hex_count;
    char _state;
+    uint64_t _total_read;
+    uint64_t _total_overhead;
 };

 /* the function rewrites the buffer given as (buf, bufsz) removing the chunked-
@ -72,8 +75,8 @@ struct phr_chunked_decoder {
 * repeatedly call the function while it returns -2 (incomplete) every time
 * supplying newly arrived data.  If the end of the chunked-encoded data is
 * found, the function returns a non-negative number indicating the number of
- * octets left undecoded at the tail of the supplied buffer.  Returns -1 on
- * error.
+ * octets left undecoded, that starts from the offset returned by `*bufsz`.
+ * Returns -1 on error.
 */
 ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_t *bufsz);