URL-API

See header file and man pages for API. All documented API details work and are tested in the 1560 test case. Closes #2842
2026-08-03 12:00:31 +03:00 · 2018-08-05 11:51:07 +02:00 · 2018-08-05 11:51:07 +02:00 · fb30ac5a2d
commit fb30ac5a2d
parent 17ca0ccff4
24 changed files with 2814 additions and 365 deletions
--- a/lib/Makefile.inc
+++ b/lib/Makefile.inc
@ -55,7 +55,7 @@ LIB_CFILES = file.c timeval.c base64.c hostip.c progress.c formdata.c   \
  curl_multibyte.c hostcheck.c conncache.c pipeline.c dotdot.c          \
  x509asn1.c http2.c smb.c curl_endian.c curl_des.c system_win32.c      \
  mime.c sha256.c setopt.c curl_path.c curl_ctype.c curl_range.c psl.c  \
-  doh.c
+  doh.c urlapi.c

 LIB_HFILES = arpa_telnet.h netrc.h file.h timeval.h hostip.h progress.h \
  formdata.h cookie.h http.h sendf.h ftp.h url.h dict.h if2ip.h         \
@ -75,7 +75,7 @@ LIB_HFILES = arpa_telnet.h netrc.h file.h timeval.h hostip.h progress.h \
  curl_setup_once.h multihandle.h setup-vms.h pipeline.h dotdot.h       \
  x509asn1.h http2.h sigpipe.h smb.h curl_endian.h curl_des.h           \
  curl_printf.h system_win32.h rand.h mime.h curl_sha256.h setopt.h     \
-  curl_path.h curl_ctype.h curl_range.h psl.h doh.h
+  curl_path.h curl_ctype.h curl_range.h psl.h doh.h urlapi-int.h

 LIB_RCFILES = libcurl.rc

--- a/lib/escape.c
+++ b/lib/escape.c
@ -5,7 +5,7 @@
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
- * Copyright (C) 1998 - 2017, Daniel Stenberg, <daniel@haxx.se>, et al.
+ * Copyright (C) 1998 - 2018, Daniel Stenberg, <daniel@haxx.se>, et al.
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
@ -41,7 +41,7 @@
   its behavior is altered by the current locale.
   See https://tools.ietf.org/html/rfc3986#section-2.3
 */
-static bool Curl_isunreserved(unsigned char in)
+bool Curl_isunreserved(unsigned char in)
 {
  switch(in) {
    case '0': case '1': case '2': case '3': case '4':
@ -141,6 +141,8 @@ char *curl_easy_escape(struct Curl_easy *data, const char *string,
 * Returns a pointer to a malloced string in *ostring with length given in
 * *olen. If length == 0, the length is assumed to be strlen(string).
 *
+ * 'data' can be set to NULL but then this function can't convert network
+ * data to host for non-ascii.
 */
 CURLcode Curl_urldecode(struct Curl_easy *data,
                        const char *string, size_t length,
@ -151,7 +153,7 @@ CURLcode Curl_urldecode(struct Curl_easy *data,
  char *ns = malloc(alloc);
  size_t strindex = 0;
  unsigned long hex;
-  CURLcode result;
+  CURLcode result = CURLE_OK;

  if(!ns)
    return CURLE_OUT_OF_MEMORY;
@ -171,11 +173,13 @@ CURLcode Curl_urldecode(struct Curl_easy *data,

      in = curlx_ultouc(hex); /* this long is never bigger than 255 anyway */

-      result = Curl_convert_from_network(data, (char *)&in, 1);
-      if(result) {
-        /* Curl_convert_from_network calls failf if unsuccessful */
-        free(ns);
-        return result;
+      if(data) {
+        result = Curl_convert_from_network(data, (char *)&in, 1);
+        if(result) {
+          /* Curl_convert_from_network calls failf if unsuccessful */
+          free(ns);
+          return result;
+        }
      }

      string += 2;
--- a/lib/escape.h
+++ b/lib/escape.h
@ -7,7 +7,7 @@
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
- * Copyright (C) 1998 - 2011, Daniel Stenberg, <daniel@haxx.se>, et al.
+ * Copyright (C) 1998 - 2018, Daniel Stenberg, <daniel@haxx.se>, et al.
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
@ -24,6 +24,7 @@
 /* Escape and unescape URL encoding in strings. The functions return a new
 * allocated string or NULL if an error occurred.  */

+bool Curl_isunreserved(unsigned char in);
 CURLcode Curl_urldecode(struct Curl_easy *data,
                        const char *string, size_t length,
                        char **ostring, size_t *olen,
--- a/lib/imap.c
+++ b/lib/imap.c
@ -159,7 +159,8 @@ const struct Curl_handler Curl_handler_imaps = {
  ZERO_NULL,                        /* connection_check */
  PORT_IMAPS,                       /* defport */
  CURLPROTO_IMAPS,                  /* protocol */
-  PROTOPT_CLOSEACTION | PROTOPT_SSL /* flags */
+  PROTOPT_CLOSEACTION | PROTOPT_SSL | /* flags */
+  PROTOPT_URLOPTIONS
 };
 #endif

--- a/lib/transfer.c
+++ b/lib/transfer.c
@ -75,6 +75,7 @@
 #include "http2.h"
 #include "mime.h"
 #include "strcase.h"
+#include "urlapi-int.h"

 /* The last 3 #include files should be in this order */
 #include "curl_printf.h"
@ -1454,311 +1455,6 @@ CURLcode Curl_posttransfer(struct Curl_easy *data)
  return CURLE_OK;
 }

-#ifndef CURL_DISABLE_HTTP
-/*
- * Find the separator at the end of the host name, or the '?' in cases like
- * http://www.url.com?id=2380
- */
-static const char *find_host_sep(const char *url)
-{
-  const char *sep;
-  const char *query;
-
-  /* Find the start of the hostname */
-  sep = strstr(url, "//");
-  if(!sep)
-    sep = url;
-  else
-    sep += 2;
-
-  query = strchr(sep, '?');
-  sep = strchr(sep, '/');
-
-  if(!sep)
-    sep = url + strlen(url);
-
-  if(!query)
-    query = url + strlen(url);
-
-  return sep < query ? sep : query;
-}
-
-/*
- * Decide in an encoding-independent manner whether a character in an
- * URL must be escaped. The same criterion must be used in strlen_url()
- * and strcpy_url().
- */
-static bool urlchar_needs_escaping(int c)
-{
-    return !(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c));
-}
-
-/*
- * strlen_url() returns the length of the given URL if the spaces within the
- * URL were properly URL encoded.
- * URL encoding should be skipped for host names, otherwise IDN resolution
- * will fail.
- */
-static size_t strlen_url(const char *url, bool relative)
-{
-  const unsigned char *ptr;
-  size_t newlen = 0;
-  bool left = TRUE; /* left side of the ? */
-  const unsigned char *host_sep = (const unsigned char *) url;
-
-  if(!relative)
-    host_sep = (const unsigned char *) find_host_sep(url);
-
-  for(ptr = (unsigned char *)url; *ptr; ptr++) {
-
-    if(ptr < host_sep) {
-      ++newlen;
-      continue;
-    }
-
-    switch(*ptr) {
-    case '?':
-      left = FALSE;
-      /* FALLTHROUGH */
-    default:
-      if(urlchar_needs_escaping(*ptr))
-        newlen += 2;
-      newlen++;
-      break;
-    case ' ':
-      if(left)
-        newlen += 3;
-      else
-        newlen++;
-      break;
-    }
-  }
-  return newlen;
-}
-
-/* strcpy_url() copies a url to a output buffer and URL-encodes the spaces in
- * the source URL accordingly.
- * URL encoding should be skipped for host names, otherwise IDN resolution
- * will fail.
- */
-static void strcpy_url(char *output, const char *url, bool relative)
-{
-  /* we must add this with whitespace-replacing */
-  bool left = TRUE;
-  const unsigned char *iptr;
-  char *optr = output;
-  const unsigned char *host_sep = (const unsigned char *) url;
-
-  if(!relative)
-    host_sep = (const unsigned char *) find_host_sep(url);
-
-  for(iptr = (unsigned char *)url;    /* read from here */
-      *iptr;         /* until zero byte */
-      iptr++) {
-
-    if(iptr < host_sep) {
-      *optr++ = *iptr;
-      continue;
-    }
-
-    switch(*iptr) {
-    case '?':
-      left = FALSE;
-      /* FALLTHROUGH */
-    default:
-      if(urlchar_needs_escaping(*iptr)) {
-        snprintf(optr, 4, "%%%02x", *iptr);
-        optr += 3;
-      }
-      else
-        *optr++=*iptr;
-      break;
-    case ' ':
-      if(left) {
-        *optr++='%'; /* add a '%' */
-        *optr++='2'; /* add a '2' */
-        *optr++='0'; /* add a '0' */
-      }
-      else
-        *optr++='+'; /* add a '+' here */
-      break;
-    }
-  }
-  *optr = 0; /* zero terminate output buffer */
-
-}
-
-/*
- * Returns true if the given URL is absolute (as opposed to relative)
- */
-static bool is_absolute_url(const char *url)
-{
-  char prot[16]; /* URL protocol string storage */
-  char letter;   /* used for a silly sscanf */
-
-  return (2 == sscanf(url, "%15[^?&/:]://%c", prot, &letter)) ? TRUE : FALSE;
-}
-
-/*
- * Concatenate a relative URL to a base URL making it absolute.
- * URL-encodes any spaces.
- * The returned pointer must be freed by the caller unless NULL
- * (returns NULL on out of memory).
- */
-static char *concat_url(const char *base, const char *relurl)
-{
-  /***
-   TRY to append this new path to the old URL
-   to the right of the host part. Oh crap, this is doomed to cause
-   problems in the future...
-  */
-  char *newest;
-  char *protsep;
-  char *pathsep;
-  size_t newlen;
-  bool host_changed = FALSE;
-
-  const char *useurl = relurl;
-  size_t urllen;
-
-  /* we must make our own copy of the URL to play with, as it may
-     point to read-only data */
-  char *url_clone = strdup(base);
-
-  if(!url_clone)
-    return NULL; /* skip out of this NOW */
-
-  /* protsep points to the start of the host name */
-  protsep = strstr(url_clone, "//");
-  if(!protsep)
-    protsep = url_clone;
-  else
-    protsep += 2; /* pass the slashes */
-
-  if('/' != relurl[0]) {
-    int level = 0;
-
-    /* First we need to find out if there's a ?-letter in the URL,
-       and cut it and the right-side of that off */
-    pathsep = strchr(protsep, '?');
-    if(pathsep)
-      *pathsep = 0;
-
-    /* we have a relative path to append to the last slash if there's one
-       available, or if the new URL is just a query string (starts with a
-       '?')  we append the new one at the end of the entire currently worked
-       out URL */
-    if(useurl[0] != '?') {
-      pathsep = strrchr(protsep, '/');
-      if(pathsep)
-        *pathsep = 0;
-    }
-
-    /* Check if there's any slash after the host name, and if so, remember
-       that position instead */
-    pathsep = strchr(protsep, '/');
-    if(pathsep)
-      protsep = pathsep + 1;
-    else
-      protsep = NULL;
-
-    /* now deal with one "./" or any amount of "../" in the newurl
-       and act accordingly */
-
-    if((useurl[0] == '.') && (useurl[1] == '/'))
-      useurl += 2; /* just skip the "./" */
-
-    while((useurl[0] == '.') &&
-          (useurl[1] == '.') &&
-          (useurl[2] == '/')) {
-      level++;
-      useurl += 3; /* pass the "../" */
-    }
-
-    if(protsep) {
-      while(level--) {
-        /* cut off one more level from the right of the original URL */
-        pathsep = strrchr(protsep, '/');
-        if(pathsep)
-          *pathsep = 0;
-        else {
-          *protsep = 0;
-          break;
-        }
-      }
-    }
-  }
-  else {
-    /* We got a new absolute path for this server */
-
-    if((relurl[0] == '/') && (relurl[1] == '/')) {
-      /* the new URL starts with //, just keep the protocol part from the
-         original one */
-      *protsep = 0;
-      useurl = &relurl[2]; /* we keep the slashes from the original, so we
-                              skip the new ones */
-      host_changed = TRUE;
-    }
-    else {
-      /* cut off the original URL from the first slash, or deal with URLs
-         without slash */
-      pathsep = strchr(protsep, '/');
-      if(pathsep) {
-        /* When people use badly formatted URLs, such as
-           "http://www.url.com?dir=/home/daniel" we must not use the first
-           slash, if there's a ?-letter before it! */
-        char *sep = strchr(protsep, '?');
-        if(sep && (sep < pathsep))
-          pathsep = sep;
-        *pathsep = 0;
-      }
-      else {
-        /* There was no slash. Now, since we might be operating on a badly
-           formatted URL, such as "http://www.url.com?id=2380" which doesn't
-           use a slash separator as it is supposed to, we need to check for a
-           ?-letter as well! */
-        pathsep = strchr(protsep, '?');
-        if(pathsep)
-          *pathsep = 0;
-      }
-    }
-  }
-
-  /* If the new part contains a space, this is a mighty stupid redirect
-     but we still make an effort to do "right". To the left of a '?'
-     letter we replace each space with %20 while it is replaced with '+'
-     on the right side of the '?' letter.
-  */
-  newlen = strlen_url(useurl, !host_changed);
-
-  urllen = strlen(url_clone);
-
-  newest = malloc(urllen + 1 + /* possible slash */
-                  newlen + 1 /* zero byte */);
-
-  if(!newest) {
-    free(url_clone); /* don't leak this */
-    return NULL;
-  }
-
-  /* copy over the root url part */
-  memcpy(newest, url_clone, urllen);
-
-  /* check if we need to append a slash */
-  if(('/' == useurl[0]) || (protsep && !*protsep) || ('?' == useurl[0]))
-    ;
-  else
-    newest[urllen++]='/';
-
-  /* then append the new piece on the right side */
-  strcpy_url(&newest[urllen], useurl, !host_changed);
-
-  free(url_clone);
-
-  return newest;
-}
-#endif /* CURL_DISABLE_HTTP */
-
 /*
 * Curl_follow() handles the URL redirect magic. Pass in the 'newurl' string
 * as given by the remote server and set up the new URL to request.
@ -1810,12 +1506,12 @@ CURLcode Curl_follow(struct Curl_easy *data,
    }
  }

-  if(!is_absolute_url(newurl)) {
+  if(!Curl_is_absolute_url(newurl, NULL, 8)) {
    /***
     *DANG* this is an RFC 2068 violation. The URL is supposed
     to be absolute and this doesn't seem to be that!
     */
-    char *absolute = concat_url(data->change.url, newurl);
+    char *absolute = Curl_concat_url(data->change.url, newurl);
    if(!absolute)
      return CURLE_OUT_OF_MEMORY;
    newurl = absolute;
@ -1824,7 +1520,7 @@ CURLcode Curl_follow(struct Curl_easy *data,
    /* The new URL MAY contain space or high byte values, that means a mighty
       stupid redirect URL but we still make an effort to do "right". */
    char *newest;
-    size_t newlen = strlen_url(newurl, FALSE);
+    size_t newlen = Curl_strlen_url(newurl, FALSE);

    /* This is an absolute URL, don't allow the custom port number */
    disallowport = TRUE;
@ -1833,7 +1529,7 @@ CURLcode Curl_follow(struct Curl_easy *data,
    if(!newest)
      return CURLE_OUT_OF_MEMORY;

-    strcpy_url(newest, newurl, FALSE); /* create a space-free URL */
+    Curl_strcpy_url(newest, newurl, FALSE); /* create a space-free URL */
    newurl = newest; /* use this instead now */

  }
--- a/lib/url.c
+++ b/lib/url.c
@ -1944,30 +1944,37 @@ static struct connectdata *allocate_conn(struct Curl_easy *data)
  return NULL;
 }

+/* returns the handdler if the given scheme is built-in */
+const struct Curl_handler *Curl_builtin_scheme(const char *scheme)
+{
+  const struct Curl_handler * const *pp;
+  const struct Curl_handler *p;
+  /* Scan protocol handler table and match against 'scheme'. The handler may
+     be changed later when the protocol specific setup function is called. */
+  for(pp = protocols; (p = *pp) != NULL; pp++)
+    if(strcasecompare(p->scheme, scheme))
+      /* Protocol found in table. Check if allowed */
+      return p;
+  return NULL; /* not found */
+}
+
+
 static CURLcode findprotocol(struct Curl_easy *data,
                             struct connectdata *conn,
                             const char *protostr)
 {
-  const struct Curl_handler * const *pp;
-  const struct Curl_handler *p;
+  const struct Curl_handler *p = Curl_builtin_scheme(protostr);

-  /* Scan protocol handler table and match against 'protostr' to set a few
-     variables based on the URL. Now that the handler may be changed later
-     when the protocol specific setup function is called. */
-  for(pp = protocols; (p = *pp) != NULL; pp++) {
-    if(strcasecompare(p->scheme, protostr)) {
-      /* Protocol found in table. Check if allowed */
-      if(!(data->set.allowed_protocols & p->protocol))
-        /* nope, get out */
-        break;
-
-      /* it is allowed for "normal" request, now do an extra check if this is
-         the result of a redirect */
-      if(data->state.this_is_a_follow &&
-         !(data->set.redir_protocols & p->protocol))
-        /* nope, get out */
-        break;
+  if(p && /* Protocol found in table. Check if allowed */
+     (data->set.allowed_protocols & p->protocol)) {

+    /* it is allowed for "normal" request, now do an extra check if this is
+       the result of a redirect */
+    if(data->state.this_is_a_follow &&
+       !(data->set.redir_protocols & p->protocol))
+      /* nope, get out */
+      ;
+    else {
      /* Perform setup complement if some. */
      conn->handler = conn->given = p;

@ -1976,7 +1983,6 @@ static CURLcode findprotocol(struct Curl_easy *data,
    }
  }

-
  /* The protocol was not found in the table, but we don't have to assign it
     to anything since it is already assigned to a dummy-struct in the
     create_conn() function when the connectdata struct is allocated. */
--- a/lib/url.h
+++ b/lib/url.h
@ -79,6 +79,8 @@ void Curl_getoff_all_pipelines(struct Curl_easy *data,

 CURLcode Curl_upkeep(struct conncache *conn_cache, void *data);

+const struct Curl_handler *Curl_builtin_scheme(const char *scheme);
+
 #define CURL_DEFAULT_PROXY_PORT 1080 /* default proxy port unless specified */
 #define CURL_DEFAULT_HTTPS_PROXY_PORT 443 /* default https proxy port unless
                                             specified */
--- a/lib/urlapi-int.h
+++ b/lib/urlapi-int.h
@ -0,0 +1,29 @@
+#ifndef HEADER_CURL_URLAPI_INT_H
+#define HEADER_CURL_URLAPI_INT_H
+/***************************************************************************
+ *                                  _   _ ____  _
+ *  Project                     ___| | | |  _ \| |
+ *                             / __| | | | |_) | |
+ *                            | (__| |_| |  _ <| |___
+ *                             \___|\___/|_| \_\_____|
+ *
+ * Copyright (C) 1998 - 2018, Daniel Stenberg, <daniel@haxx.se>, et al.
+ *
+ * This software is licensed as described in the file COPYING, which
+ * you should have received as part of this distribution. The terms
+ * are also available at https://curl.haxx.se/docs/copyright.html.
+ *
+ * You may opt to use, copy, modify, merge, publish, distribute and/or sell
+ * copies of the Software, and permit persons to whom the Software is
+ * furnished to do so, under the terms of the COPYING file.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ***************************************************************************/
+#include "curl_setup.h"
+bool Curl_is_absolute_url(const char *url, char *scheme, size_t buflen);
+char *Curl_concat_url(const char *base, const char *relurl);
+size_t Curl_strlen_url(const char *url, bool relative);
+void Curl_strcpy_url(char *output, const char *url, bool relative);
+#endif
--- a/lib/urlapi.c
+++ b/lib/urlapi.c