urlapi: split parts of parseurl() into sub functions

- parse_file
- parse_scheme
- guess_scheme
- handle_fragment
- handle_query
- handle_path

Closes #20205
2026-04-11 12:01:42 +08:00 · 2026-01-07 09:26:14 +01:00 · 2026-01-07 09:26:14 +01:00 · 5f612acaa1
commit 5f612acaa1
parent f6e8531c03
2 changed files with 322 additions and 303 deletions
--- a/lib/urlapi.c
+++ b/lib/urlapi.c
@ -898,17 +898,301 @@ end:
  return result ? 1 : 0; /* success */
 }

+/*
+ * parse_file() handles a URL that parseurl() has identified as using the
+ * "file" scheme.
+ *
+ * It stores a copy of "file" in u->scheme and works out where in 'url' the
+ * path starts. In _WIN32 builds, a hostname other than "localhost" or
+ * "127.0.0.1" is treated as the host of an UNC path and is appended to
+ * 'host'; in all other cases 'host' is reset.
+ *
+ * On success, *pathp points into 'url' (nothing is copied) and *pathlenp is
+ * the number of bytes from that point to the end of the URL. Neither output
+ * is written when an error is returned.
+ *
+ * Returns CURLUE_OK, CURLUE_BAD_FILE_URL, CURLUE_OUT_OF_MEMORY or, in _WIN32
+ * builds, a dynbuf error converted with cc2cu(). u->scheme is not released
+ * here on failure; that is left to the caller.
+ */
+static CURLUcode parse_file(const char *url, size_t urllen, CURLU *u,
+                            struct dynbuf *host, const char **pathp,
+                            size_t *pathlenp)
+{
+  const char *path;
+  size_t pathlen;
+  bool uncpath = FALSE;
+  if(urllen <= 6)
+    /* file:/ is not enough to actually be a complete file: URL */
+    return CURLUE_BAD_FILE_URL;
+
+  /* skip the five bytes of "file:", the path is a pointer into url */
+  path = &url[5];
+  pathlen = urllen - 5;
+
+  u->scheme = curlx_strdup("file");
+  if(!u->scheme)
+    return CURLUE_OUT_OF_MEMORY;
+
+  /* Extra handling URLs with an authority component (i.e. that start with
+   * "file://")
+   *
+   * We allow omitted hostname (e.g. file:/<path>) -- valid according to
+   * RFC 8089, but not the (current) WHAT-WG URL spec.
+   */
+  if(path[0] == '/' && path[1] == '/') {
+    /* swallow the two slashes */
+    const char *ptr = &path[2];
+
+    /*
+     * According to RFC 8089, a file: URL can be reliably dereferenced if:
+     *
+     *  o it has no/blank hostname, or
+     *
+     *  o the hostname matches "localhost" (case-insensitively), or
+     *
+     *  o the hostname is a FQDN that resolves to this machine, or
+     *
+     *  o it is an UNC String transformed to an URI (Windows only, RFC 8089
+     *    Appendix E.3).
+     *
+     * For brevity, we only consider URLs with empty, "localhost", or
+     * "127.0.0.1" hostnames as local, otherwise as an UNC String.
+     *
+     * Additionally, there is an exception for URLs with a Windows drive
+     * letter in the authority (which was accidentally omitted from RFC 8089
+     * Appendix E, but believe me, it was meant to be there. --MK)
+     */
+    if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
+      /* the URL includes a hostname, it must match "localhost" or
+         "127.0.0.1" to be valid */
+      if(checkprefix("localhost/", ptr) ||
+         checkprefix("127.0.0.1/", ptr)) {
+        /* "localhost" and "127.0.0.1" are both nine bytes long */
+        ptr += 9; /* now points to the slash after the host */
+      }
+      else {
+#ifdef _WIN32
+        size_t len;
+
+        /* the hostname, NetBIOS computer name, can not contain disallowed
+           chars, and the delimiting slash character must be appended to the
+           hostname. 'path' only serves as a scan pointer here, it is
+           assigned again from 'ptr' after this block */
+        path = strpbrk(ptr, "/\\:*?\"<>|");
+        if(!path || *path != '/')
+          return CURLUE_BAD_FILE_URL;
+
+        len = path - ptr;
+        if(len) {
+          CURLcode code = curlx_dyn_addn(host, ptr, len);
+          if(code)
+            return cc2cu(code);
+          uncpath = TRUE;
+        }
+
+        ptr -= 2; /* now points to the // before the host in UNC */
+#else
+        /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
+           none */
+        return CURLUE_BAD_FILE_URL;
+#endif
+      }
+    }
+
+    /* the path starts at ptr and runs to the end of the URL */
+    path = ptr;
+    pathlen = urllen - (ptr - url);
+  }
+
+  if(!uncpath)
+    /* no host for file: URLs by default */
+    curlx_dyn_reset(host);
+
+#if !defined(_WIN32) && !defined(MSDOS) && !defined(__CYGWIN__)
+  /* Do not allow Windows drive letters when not in Windows.
+   * This catches both "file:/c:" and "file:c:" */
+  if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
+     STARTS_WITH_URL_DRIVE_PREFIX(path)) {
+    /* File drive letters are only accepted in MS-DOS/Windows */
+    return CURLUE_BAD_FILE_URL;
+  }
+#else
+  /* If the path starts with a slash and a drive letter, ditch the slash */
+  if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
+    /* only the pointer and the length are adjusted, no bytes are moved */
+    path++;
+    pathlen--;
+  }
+#endif
+  /* hand the results back only now that nothing can fail anymore */
+  *pathp = path;
+  *pathlenp = pathlen;
+  return CURLUE_OK;
+}
+
+/*
+ * parse_scheme() settles which scheme to store in the handle and finds where
+ * the hostname starts. parseurl() calls it for URLs it does not treat as
+ * "file:" URLs.
+ *
+ * A non-zero 'schemelen' means the URL carries a scheme, whose name is taken
+ * from 'schemebuf'. A zero 'schemelen' means there is no scheme, which is
+ * only accepted when CURLU_DEFAULT_SCHEME or CURLU_GUESS_SCHEME is set in
+ * 'flags'.
+ *
+ * On success *hostpp points into 'url' at the start of the hostname and
+ * u->scheme holds a copy of the scheme, except when the scheme is left to be
+ * guessed (no scheme in the URL and CURLU_DEFAULT_SCHEME not set), in which
+ * case u->scheme is not touched. Do not rely on *hostpp when an error is
+ * returned.
+ *
+ * Returns CURLUE_OK, CURLUE_UNSUPPORTED_SCHEME, CURLUE_BAD_SLASHES,
+ * CURLUE_BAD_SCHEME or CURLUE_OUT_OF_MEMORY.
+ */
+static CURLUcode parse_scheme(const char *url, CURLU *u, char *schemebuf,
+                              size_t schemelen, unsigned int flags,
+                              const char **hostpp)
+{
+  /* the scheme name to store in the handle, NULL while none is known */
+  const char *schemep = NULL;
+
+  if(schemelen) {
+    int i = 0;
+    /* start one byte past url[schemelen], presumably the ':' that ends the
+       scheme (TODO confirm against the caller), and count the slashes that
+       follow. Counting stops at four, enough to detect "too many" */
+    const char *p = &url[schemelen + 1];
+    while((*p == '/') && (i < 4)) {
+      p++;
+      i++;
+    }
+
+    /* the scheme is checked before the slash count, so an unsupported
+       scheme is reported even when the slashes are wrong as well */
+    schemep = schemebuf;
+    if(!Curl_get_scheme_handler(schemep) &&
+       !(flags & CURLU_NON_SUPPORT_SCHEME))
+      return CURLUE_UNSUPPORTED_SCHEME;
+
+    if((i < 1) || (i > 3))
+      /* less than one or more than three slashes */
+      return CURLUE_BAD_SLASHES;
+
+    *hostpp = p; /* hostname starts here */
+  }
+  else {
+    /* no scheme! */
+
+    if(!(flags & (CURLU_DEFAULT_SCHEME | CURLU_GUESS_SCHEME)))
+      return CURLUE_BAD_SCHEME;
+
+    /* with only CURLU_GUESS_SCHEME set, schemep stays NULL and no scheme is
+       stored by this function */
+    if(flags & CURLU_DEFAULT_SCHEME)
+      schemep = DEFAULT_SCHEME;
+
+    /*
+     * There is no scheme in the URL, so the hostname is taken to start at
+     * the very beginning of it.
+     */
+    *hostpp = url;
+  }
+
+  if(schemep) {
+    u->scheme = curlx_strdup(schemep);
+    if(!u->scheme)
+      return CURLUE_OUT_OF_MEMORY;
+  }
+  return CURLUE_OK;
+}
+
+/*
+ * guess_scheme() picks a scheme from the start of the hostname held in
+ * 'host' and stores a copy of it in u->scheme.
+ *
+ * Hostnames starting with "ftp.", "dict.", "ldap.", "imap.", "smtp." or
+ * "pop3." (as matched by checkprefix()) get the scheme of the same name,
+ * everything else gets "http". u->guessed_scheme is set to TRUE on success.
+ *
+ * u->scheme is overwritten without being freed first; parseurl() only calls
+ * this while u->scheme is still NULL.
+ *
+ * Returns CURLUE_OK or CURLUE_OUT_OF_MEMORY.
+ */
+static CURLUcode guess_scheme(CURLU *u, struct dynbuf *host)
+{
+  const char *hostname = curlx_dyn_ptr(host);
+  const char *schemep = NULL;
+  /* legacy curl-style guess based on hostname */
+  if(checkprefix("ftp.", hostname))
+    schemep = "ftp";
+  else if(checkprefix("dict.", hostname))
+    schemep = "dict";
+  else if(checkprefix("ldap.", hostname))
+    schemep = "ldap";
+  else if(checkprefix("imap.", hostname))
+    schemep = "imap";
+  else if(checkprefix("smtp.", hostname))
+    schemep = "smtp";
+  else if(checkprefix("pop3.", hostname))
+    schemep = "pop3";
+  else
+    /* nothing matched, fall back to HTTP */
+    schemep = "http";
+
+  u->scheme = curlx_strdup(schemep);
+  if(!u->scheme)
+    return CURLUE_OUT_OF_MEMORY;
+
+  /* flagged only once the scheme has actually been stored */
+  u->guessed_scheme = TRUE;
+  return CURLUE_OK;
+}
+
+/*
+ * handle_fragment() stores the fragment part of a URL in the handle.
+ *
+ * 'fragment' points at the '#' and 'fraglen' is the number of bytes from
+ * there on, the '#' included. u->fragment_present is always set to TRUE.
+ * When something follows the '#', that text is stored in u->fragment,
+ * URL-encoded if CURLU_URLENCODE is set in 'flags' and copied as-is
+ * otherwise. For a lone '#' u->fragment is not assigned.
+ *
+ * Returns CURLUE_OK, the error from urlencode_str() or
+ * CURLUE_OUT_OF_MEMORY.
+ */
+static CURLUcode handle_fragment(CURLU *u, const char *fragment,
+                                 size_t fraglen, unsigned int flags)
+{
+  CURLUcode result;
+  u->fragment_present = TRUE;
+  if(fraglen > 1) {
+    /* skip the leading '#' in the copy but include the terminating null */
+    if(flags & CURLU_URLENCODE) {
+      struct dynbuf enc;
+      curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
+      result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE);
+      if(result)
+        return result;
+      /* the encoded buffer is handed over to the handle, 'enc' is not freed
+         here */
+      u->fragment = curlx_dyn_ptr(&enc);
+    }
+    else {
+      u->fragment = Curl_memdup0(fragment + 1, fraglen - 1);
+      if(!u->fragment)
+        return CURLUE_OUT_OF_MEMORY;
+    }
+  }
+  return CURLUE_OK;
+}
+
+/*
+ * handle_query() stores the query part of a URL in the handle.
+ *
+ * 'query' points at the '?' and 'qlen' is the number of bytes that belong to
+ * the query, the '?' included. u->query_present is always set to TRUE. When
+ * something follows the '?', that text is stored in u->query, URL-encoded if
+ * CURLU_URLENCODE is set in 'flags' and copied as-is otherwise. For a lone
+ * '?' u->query becomes an allocated empty string.
+ *
+ * Returns CURLUE_OK, the error from urlencode_str() or
+ * CURLUE_OUT_OF_MEMORY.
+ */
+static CURLUcode handle_query(CURLU *u, const char *query,
+                              size_t qlen, unsigned int flags)
+{
+  u->query_present = TRUE;
+  if(qlen > 1) {
+    if(flags & CURLU_URLENCODE) {
+      struct dynbuf enc;
+      CURLUcode result;
+      curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
+      /* skip the leading question mark. NOTE(review): the last argument is
+         TRUE here while the fragment and path helpers pass FALSE, presumably
+         to select query-specific encoding - confirm in urlencode_str() */
+      result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE);
+      if(result)
+        return result;
+      /* the encoded buffer is handed over to the handle, 'enc' is not freed
+         here */
+      u->query = curlx_dyn_ptr(&enc);
+    }
+    else {
+      u->query = Curl_memdup0(query + 1, qlen - 1);
+      if(!u->query)
+        return CURLUE_OUT_OF_MEMORY;
+    }
+  }
+  else {
+    /* single byte query: only the '?' itself, store an empty string */
+    u->query = curlx_strdup("");
+    if(!u->query)
+      return CURLUE_OUT_OF_MEMORY;
+  }
+  return CURLUE_OK;
+}
+
+/*
+ * handle_path() stores the path part of a URL in the handle.
+ *
+ * 'path' and 'pathlen' describe the path; parseurl() passes it with the
+ * query and the fragment already trimmed off. With CURLU_URLENCODE set in
+ * 'flags' a non-empty path is URL-encoded first. A path longer than one byte
+ * is then stored in u->path and, unless CURLU_PATH_AS_IS is set, replaced by
+ * the output of dedotdotify() when that function returns a new string.
+ *
+ * The code checks u->path for NULL to tell whether the encoding step already
+ * stored a buffer, so it presumably expects u->path to be NULL on entry -
+ * TODO confirm against the caller.
+ *
+ * Returns CURLUE_OK, the error from urlencode_str() or
+ * CURLUE_OUT_OF_MEMORY.
+ */
+static CURLUcode handle_path(CURLU *u, const char *path,
+                             size_t pathlen, unsigned int flags)
+{
+  CURLUcode result;
+  if(pathlen && (flags & CURLU_URLENCODE)) {
+    struct dynbuf enc;
+    curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
+    result = urlencode_str(&enc, path, pathlen, TRUE, FALSE);
+    if(result)
+      return result;
+    /* from here on both 'path' and u->path refer to the encoded buffer,
+       which the handle now holds; 'enc' is not freed here */
+    pathlen = curlx_dyn_len(&enc);
+    path = u->path = curlx_dyn_ptr(&enc);
+  }
+
+  if(pathlen <= 1) {
+    /* there is no path left or just the slash, unset. 'path' is a by-value
+       parameter that is not read again, so this assignment has no effect
+       outside this function. NOTE(review): when CURLU_URLENCODE produced
+       the buffer above, u->path still holds it at this point */
+    path = NULL;
+  }
+  else {
+    if(!u->path) {
+      /* not encoded above, keep a private copy of the path */
+      u->path = Curl_memdup0(path, pathlen);
+      if(!u->path)
+        return CURLUE_OUT_OF_MEMORY;
+      path = u->path;
+    }
+    else if(flags & CURLU_URLENCODE)
+      /* it might have encoded more than just the path so cut it */
+      u->path[pathlen] = 0;
+
+    if(!(flags & CURLU_PATH_AS_IS)) {
+      /* remove ../ and ./ sequences according to RFC3986 */
+      char *dedot;
+      int err = dedotdotify(path, pathlen, &dedot);
+      if(err)
+        return CURLUE_OUT_OF_MEMORY;
+      /* a NULL 'dedot' without an error leaves u->path as it is */
+      if(dedot) {
+        curlx_free(u->path);
+        u->path = dedot;
+      }
+    }
+  }
+  return CURLUE_OK;
+}
+
 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
 {
  const char *path;
  size_t pathlen;
-  char *query = NULL;
-  char *fragment = NULL;
  char schemebuf[MAX_SCHEME_LEN + 1];
  size_t schemelen = 0;
  size_t urllen;
  CURLUcode result = CURLUE_OK;
-  size_t fraglen = 0;
  struct dynbuf host;

  DEBUGASSERT(url);
@ -924,170 +1208,14 @@ static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
                                            CURLU_DEFAULT_SCHEME));

  /* handle the file: scheme */
-  if(schemelen && !strcmp(schemebuf, "file")) {
-    bool uncpath = FALSE;
-    if(urllen <= 6) {
-      /* file:/ is not enough to actually be a complete file: URL */
-      result = CURLUE_BAD_FILE_URL;
-      goto fail;
-    }
-
-    /* path has been allocated large enough to hold this */
-    path = &url[5];
-    pathlen = urllen - 5;
-
-    u->scheme = curlx_strdup("file");
-    if(!u->scheme) {
-      result = CURLUE_OUT_OF_MEMORY;
-      goto fail;
-    }
-
-    /* Extra handling URLs with an authority component (i.e. that start with
-     * "file://")
-     *
-     * We allow omitted hostname (e.g. file:/<path>) -- valid according to
-     * RFC 8089, but not the (current) WHAT-WG URL spec.
-     */
-    if(path[0] == '/' && path[1] == '/') {
-      /* swallow the two slashes */
-      const char *ptr = &path[2];
-
-      /*
-       * According to RFC 8089, a file: URL can be reliably dereferenced if:
-       *
-       *  o it has no/blank hostname, or
-       *
-       *  o the hostname matches "localhost" (case-insensitively), or
-       *
-       *  o the hostname is a FQDN that resolves to this machine, or
-       *
-       *  o it is an UNC String transformed to an URI (Windows only, RFC 8089
-       *    Appendix E.3).
-       *
-       * For brevity, we only consider URLs with empty, "localhost", or
-       * "127.0.0.1" hostnames as local, otherwise as an UNC String.
-       *
-       * Additionally, there is an exception for URLs with a Windows drive
-       * letter in the authority (which was accidentally omitted from RFC 8089
-       * Appendix E, but believe me, it was meant to be there. --MK)
-       */
-      if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
-        /* the URL includes a hostname, it must match "localhost" or
-           "127.0.0.1" to be valid */
-        if(checkprefix("localhost/", ptr) ||
-           checkprefix("127.0.0.1/", ptr)) {
-          ptr += 9; /* now points to the slash after the host */
-        }
-        else {
-#ifdef _WIN32
-          size_t len;
-
-          /* the hostname, NetBIOS computer name, can not contain disallowed
-             chars, and the delimiting slash character must be appended to the
-             hostname */
-          path = strpbrk(ptr, "/\\:*?\"<>|");
-          if(!path || *path != '/') {
-            result = CURLUE_BAD_FILE_URL;
-            goto fail;
-          }
-
-          len = path - ptr;
-          if(len) {
-            CURLcode code = curlx_dyn_addn(&host, ptr, len);
-            if(code) {
-              result = cc2cu(code);
-              goto fail;
-            }
-            uncpath = TRUE;
-          }
-
-          ptr -= 2; /* now points to the // before the host in UNC */
-#else
-          /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
-             none */
-          result = CURLUE_BAD_FILE_URL;
-          goto fail;
-#endif
-        }
-      }
-
-      path = ptr;
-      pathlen = urllen - (ptr - url);
-    }
-
-    if(!uncpath)
-      /* no host for file: URLs by default */
-      curlx_dyn_reset(&host);
-
-#if !defined(_WIN32) && !defined(MSDOS) && !defined(__CYGWIN__)
-    /* Do not allow Windows drive letters when not in Windows.
-     * This catches both "file:/c:" and "file:c:" */
-    if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
-       STARTS_WITH_URL_DRIVE_PREFIX(path)) {
-      /* File drive letters are only accepted in MS-DOS/Windows */
-      result = CURLUE_BAD_FILE_URL;
-      goto fail;
-    }
-#else
-    /* If the path starts with a slash and a drive letter, ditch the slash */
-    if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
-      /* This cannot be done with strcpy, as the memory chunks overlap! */
-      path++;
-      pathlen--;
-    }
-#endif
-  }
+  if(schemelen && !strcmp(schemebuf, "file"))
+    result = parse_file(url, urllen, u, &host, &path, &pathlen);
  else {
-    /* clear path */
-    const char *schemep = NULL;
-    const char *hostp;
+    const char *hostp = NULL;
    size_t hostlen;
-
-    if(schemelen) {
-      int i = 0;
-      const char *p = &url[schemelen + 1];
-      while((*p == '/') && (i < 4)) {
-        p++;
-        i++;
-      }
-
-      schemep = schemebuf;
-      if(!Curl_get_scheme_handler(schemep) &&
-         !(flags & CURLU_NON_SUPPORT_SCHEME)) {
-        result = CURLUE_UNSUPPORTED_SCHEME;
-        goto fail;
-      }
-
-      if((i < 1) || (i > 3)) {
-        /* less than one or more than three slashes */
-        result = CURLUE_BAD_SLASHES;
-        goto fail;
-      }
-      hostp = p; /* hostname starts here */
-    }
-    else {
-      /* no scheme! */
-
-      if(!(flags & (CURLU_DEFAULT_SCHEME | CURLU_GUESS_SCHEME))) {
-        result = CURLUE_BAD_SCHEME;
-        goto fail;
-      }
-      if(flags & CURLU_DEFAULT_SCHEME)
-        schemep = DEFAULT_SCHEME;
-
-      /*
-       * The URL was badly formatted, let's try without scheme specified.
-       */
-      hostp = url;
-    }
-
-    if(schemep) {
-      u->scheme = curlx_strdup(schemep);
-      if(!u->scheme) {
-        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
-    }
+    result = parse_scheme(url, u, schemebuf, schemelen, flags, &hostp);
+    if(result)
+      goto fail;

    /* find the end of the hostname + port number */
    hostlen = strcspn(hostp, "/?#");
@ -1096,155 +1224,45 @@ static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
    /* this pathlen also contains the query and the fragment */
    pathlen = urllen - (path - url);
    if(hostlen) {
-
-      result = parse_authority(u, hostp, hostlen, flags, &host, schemelen);
-      if(result)
-        goto fail;
-
-      if((flags & CURLU_GUESS_SCHEME) && !schemep) {
-        const char *hostname = curlx_dyn_ptr(&host);
-        /* legacy curl-style guess based on hostname */
-        if(checkprefix("ftp.", hostname))
-          schemep = "ftp";
-        else if(checkprefix("dict.", hostname))
-          schemep = "dict";
-        else if(checkprefix("ldap.", hostname))
-          schemep = "ldap";
-        else if(checkprefix("imap.", hostname))
-          schemep = "imap";
-        else if(checkprefix("smtp.", hostname))
-          schemep = "smtp";
-        else if(checkprefix("pop3.", hostname))
-          schemep = "pop3";
-        else
-          schemep = "http";
-
-        u->scheme = curlx_strdup(schemep);
-        if(!u->scheme) {
-          result = CURLUE_OUT_OF_MEMORY;
-          goto fail;
-        }
-        u->guessed_scheme = TRUE;
-      }
+      result = parse_authority(u, hostp, hostlen, flags, &host,
+                               u->scheme != NULL);
+      if(!result && (flags & CURLU_GUESS_SCHEME) && !u->scheme)
+        result = guess_scheme(u, &host);
    }
    else if(flags & CURLU_NO_AUTHORITY) {
      /* allowed to be empty. */
-      if(curlx_dyn_add(&host, "")) {
+      if(curlx_dyn_add(&host, ""))
        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
    }
-    else {
+    else
      result = CURLUE_NO_HOST;
-      goto fail;
+  }
+  if(!result) {
+    /* The path might at this point contain a fragment and/or a query to
+       handle */
+    const char *fragment = strchr(path, '#');
+    if(fragment) {
+      size_t fraglen = pathlen - (fragment - path);
+      result = handle_fragment(u, fragment, fraglen, flags);
+      /* after this, pathlen still contains the query */
+      pathlen -= fraglen;
    }
  }
-
-  fragment = strchr(path, '#');
-  if(fragment) {
-    fraglen = pathlen - (fragment - path);
-    u->fragment_present = TRUE;
-    if(fraglen > 1) {
-      /* skip the leading '#' in the copy but include the terminating null */
-      if(flags & CURLU_URLENCODE) {
-        struct dynbuf enc;
-        curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
-        result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE);
-        if(result)
-          goto fail;
-        u->fragment = curlx_dyn_ptr(&enc);
-      }
-      else {
-        u->fragment = Curl_memdup0(fragment + 1, fraglen - 1);
-        if(!u->fragment) {
-          result = CURLUE_OUT_OF_MEMORY;
-          goto fail;
-        }
-      }
-    }
-    /* after this, pathlen still contains the query */
-    pathlen -= fraglen;
-  }
-
-  query = memchr(path, '?', pathlen);
-  if(query) {
-    size_t qlen = fragment ? (size_t)(fragment - query) :
-      pathlen - (query - path);
-    pathlen -= qlen;
-    u->query_present = TRUE;
-    if(qlen > 1) {
-      if(flags & CURLU_URLENCODE) {
-        struct dynbuf enc;
-        curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
-        /* skip the leading question mark */
-        result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE);
-        if(result)
-          goto fail;
-        u->query = curlx_dyn_ptr(&enc);
-      }
-      else {
-        u->query = Curl_memdup0(query + 1, qlen - 1);
-        if(!u->query) {
-          result = CURLUE_OUT_OF_MEMORY;
-          goto fail;
-        }
-      }
-    }
-    else {
-      /* single byte query */
-      u->query = curlx_strdup("");
-      if(!u->query) {
-        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
+  if(!result) {
+    const char *query = memchr(path, '?', pathlen);
+    if(query) {
+      size_t qlen = pathlen - (query - path);
+      result = handle_query(u, query, qlen, flags);
+      pathlen -= qlen;
    }
  }
-
-  if(pathlen && (flags & CURLU_URLENCODE)) {
-    struct dynbuf enc;
-    curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
-    result = urlencode_str(&enc, path, pathlen, TRUE, FALSE);
-    if(result)
-      goto fail;
-    pathlen = curlx_dyn_len(&enc);
-    path = u->path = curlx_dyn_ptr(&enc);
+  if(!result)
+    /* the fragment and query parts are trimmed off from the path */
+    result = handle_path(u, path, pathlen, flags);
+  if(!result) {
+    u->host = curlx_dyn_ptr(&host);
+    return CURLUE_OK;
  }
-
-  if(pathlen <= 1) {
-    /* there is no path left or just the slash, unset */
-    path = NULL;
-  }
-  else {
-    if(!u->path) {
-      u->path = Curl_memdup0(path, pathlen);
-      if(!u->path) {
-        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
-      path = u->path;
-    }
-    else if(flags & CURLU_URLENCODE)
-      /* it might have encoded more than just the path so cut it */
-      u->path[pathlen] = 0;
-
-    if(!(flags & CURLU_PATH_AS_IS)) {
-      /* remove ../ and ./ sequences according to RFC3986 */
-      char *dedot;
-      int err = dedotdotify(path, pathlen, &dedot);
-      if(err) {
-        result = CURLUE_OUT_OF_MEMORY;
-        goto fail;
-      }
-      if(dedot) {
-        curlx_free(u->path);
-        u->path = dedot;
-      }
-    }
-  }
-
-  u->host = curlx_dyn_ptr(&host);
-
-  return result;
 fail:
  curlx_dyn_free(&host);
  free_urlhandle(u);
--- a/tests/data/test1560
+++ b/tests/data/test1560
@ -4,6 +4,7 @@
 <keywords>
 unittest
 urlapi
+url
 </keywords>
 </info>