summaryrefslogtreecommitdiff
path: root/venv/lib/python3.11/site-packages/httpx/_urlparse.py
diff options
context:
space:
mode:
authorcyfraeviolae <cyfraeviolae>2024-04-03 03:10:44 -0400
committercyfraeviolae <cyfraeviolae>2024-04-03 03:10:44 -0400
commit6d7ba58f880be618ade07f8ea080fe8c4bf8a896 (patch)
treeb1c931051ffcebd2bd9d61d98d6233ffa289bbce /venv/lib/python3.11/site-packages/httpx/_urlparse.py
parent4f884c9abc32990b4061a1bb6997b4b37e58ea0b (diff)
venv
Diffstat (limited to 'venv/lib/python3.11/site-packages/httpx/_urlparse.py')
-rw-r--r--venv/lib/python3.11/site-packages/httpx/_urlparse.py502
1 files changed, 502 insertions, 0 deletions
diff --git a/venv/lib/python3.11/site-packages/httpx/_urlparse.py b/venv/lib/python3.11/site-packages/httpx/_urlparse.py
new file mode 100644
index 0000000..6a4b55b
--- /dev/null
+++ b/venv/lib/python3.11/site-packages/httpx/_urlparse.py
@@ -0,0 +1,502 @@
+"""
+An implementation of `urlparse` that provides URL validation and normalization
+as described by RFC3986.
+
+We rely on this implementation rather than the one in Python's stdlib, because:
+
+* It provides more complete URL validation.
+* It properly differentiates between an empty querystring and an absent querystring,
+ to distinguish URLs with a trailing '?'.
+* It handles scheme, hostname, port, and path normalization.
+* It supports IDNA hostnames, normalizing them to their encoded form.
+* The API supports passing individual components, as well as the complete URL string.
+
+Previously we relied on the excellent `rfc3986` package to handle URL parsing and
+validation, but this module provides a simpler alternative, with less indirection
+required.
+"""
+from __future__ import annotations
+
+import ipaddress
+import re
+import typing
+
+import idna
+
+from ._exceptions import InvalidURL
+
# Upper bound on the length of a URL, and of any individually supplied component.
MAX_URL_LENGTH = 65536

# Characters that never need to be percent-encoded.
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
)
# The "sub-delims" character set from RFC 3986.
SUB_DELIMS = "!$&'()*+,;="

# Matches a single, already percent-encoded '%xx' escape sequence.
PERCENT_ENCODED_REGEX = re.compile(r"%[A-Fa-f0-9]{2}")
+
+
# A URL reference is split into five components:
#
#   {scheme}:        (optional)
#   //{authority}    (optional)
#   {path}
#   ?{query}         (optional)
#   #{fragment}      (optional)
#
# Optional components that are absent are reported as `None` by the match,
# which lets callers tell "no query" apart from "empty query".
URL_REGEX = re.compile(
    "".join(
        [
            r"(?:(?P<scheme>([a-zA-Z][a-zA-Z0-9+.-]*)?):)?",
            r"(?://(?P<authority>[^/?#]*))?",
            r"(?P<path>[^?#]*)",
            r"(?:\?(?P<query>[^#]*))?",
            r"(?:#(?P<fragment>.*))?",
        ]
    )
)
+
# An authority is split into three components:
#
#   {userinfo}@    (optional)
#   {host}
#   :{port}        (optional)
#
# * userinfo: any character sequence.
# * host: either an IPv6 address enclosed within square brackets,
#   or any character sequence excluding ':' and '@'.
# * port: any character sequence. It is validated later, not here.
AUTHORITY_REGEX = re.compile(
    r"(?:(?P<userinfo>.*)@)?"
    r"(?P<host>(\[.*\]|[^:@]*))"
    r":?(?P<port>.*)?"
)
+
+
# When urlparse is called with an individual component as a keyword argument,
# that component is validated on its own against the matching pattern here.
# The first five patterns deliberately repeat the ones used in URL_REGEX.
COMPONENT_REGEX = {
    component: re.compile(pattern)
    for component, pattern in (
        ("scheme", r"([a-zA-Z][a-zA-Z0-9+.-]*)?"),
        ("authority", r"[^/?#]*"),
        ("path", r"[^?#]*"),
        ("query", r"[^#]*"),
        ("fragment", r".*"),
        ("userinfo", r"[^@]*"),
        ("host", r"(\[.*\]|[^:]*)"),
        ("port", r".*"),
    )
}
+
+
# Cheap first-pass checks that only decide whether a host *looks like* an
# IP address. Actual validation is handed off to the stdlib 'ipaddress' module.
#
# Four dot-separated runs of digits, e.g. "127.0.0.1".
IPv4_STYLE_HOSTNAME = re.compile(
    r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$"
)
# Anything enclosed within square brackets, e.g. "[::1]".
IPv6_STYLE_HOSTNAME = re.compile(
    r"^\[.*\]$"
)
+
+
class ParseResult(typing.NamedTuple):
    """
    The components of a parsed URL.

    `str()` on an instance reassembles the components into a URL string.
    """

    scheme: str
    userinfo: str
    # An IPv6 host is held without its enclosing "[...]"; the brackets are
    # added back by `authority` and `netloc` whenever the host contains ':'.
    host: str
    # `None` means that no port is included.
    port: int | None
    path: str
    # `None` means no "?" portion at all; "" means a bare trailing "?".
    query: str | None
    # `None` means no "#" portion at all; "" means a bare trailing "#".
    fragment: str | None

    @property
    def authority(self) -> str:
        """
        The "{userinfo}@{host}:{port}" portion, omitting any absent parts.
        """
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """
        The "{host}:{port}" portion, omitting the port when it is `None`.
        """
        host = self.host
        if ":" in host:
            host = f"[{host}]"
        if self.port is None:
            return host
        return f"{host}:{self.port}"

    def copy_with(self, **kwargs: str | None) -> ParseResult:
        """
        Return a new `ParseResult` with the given components replaced.

        The keyword arguments are passed to `urlparse`, layered over this
        instance's scheme, authority, path, query and fragment. With no
        keyword arguments the instance itself is returned.
        """
        if not kwargs:
            return self

        components = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
            **kwargs,
        }
        return urlparse("", **components)

    def __str__(self) -> str:
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")
        authority = self.authority
        if authority:
            pieces.append(f"//{authority}")
        pieces.append(self.path)
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")
        return "".join(pieces)
+
+
def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate and normalize a URL, returning a `ParseResult`.

    Components are read from `url`, and may each be overridden with a keyword
    argument: `scheme`, `authority`, `path`, `query`, `fragment`, `userinfo`,
    `host` or `port`. Some convenience keywords are translated first:

    * `netloc` is split into `host` and `port`. A bracketed IPv6 literal such
      as "[::1]:8080" is split after the closing bracket.
    * `username` and/or `password` are quoted and combined into `userinfo`.
    * `raw_path` is split into `path` and `query`.

    Raises `InvalidURL` if the URL or any component is too long, contains
    non-printable ASCII characters, or fails validation. Any other keyword
    given a non-None value raises `KeyError`.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    if any(char.isascii() and not char.isprintable() for char in url):
        raise InvalidURL("Invalid non-printable ASCII character in URL")

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        address, closing, remainder = netloc.partition("]")
        if netloc.startswith("[") and (not remainder or remainder.startswith(":")):
            # Bracketed IPv6 literal, eg. "[::1]" or "[::1]:8080". The address
            # itself contains ':', so the port delimiter is the one that
            # follows the closing bracket.
            kwargs["host"] = address + closing
            kwargs["port"] = remainder[1:]
        else:
            kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "")
        password = quote(kwargs.pop("password", "") or "")
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No "?" at all is distinct from a trailing "?" with empty query.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is not None:
            if len(value) > MAX_URL_LENGTH:
                raise InvalidURL(f"URL component '{key}' too long")

            # If a component includes any ASCII control characters including
            # \t, \r, \n, then treat it as invalid.
            if any(char.isascii() and not char.isprintable() for char in value):
                raise InvalidURL(
                    f"Invalid non-printable ASCII character in URL component '{key}'"
                )

            # Ensure that keyword arguments match as a valid regex.
            if not COMPONENT_REGEX[key].fullmatch(value):
                raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    fragment = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII strings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
    parsed_host: str = encode_host(host)
    # Look up the default port with the *normalized* scheme, so that
    # "HTTP://example.com:80" drops its port just as "http://..." does.
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_authority:
        path = normalize_path(path)

    # The GEN_DELIMS set is... : / ? # [ ] @
    # These do not need to be percent-quoted unless they serve as delimiters
    # for the specific component.

    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
    parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
    # For 'query' we need to drop '#' from the GEN_DELIMS set.
    parsed_query: str | None = (
        None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
    )
    # For 'fragment' we can include all of the GEN_DELIMS set.
    parsed_fragment: str | None = (
        None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
    )

    # The parsed ASCII strings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_fragment,
    )
+
+
def encode_host(host: str) -> str:
    """
    Validate a host and return its normalized ASCII form.

    * An empty host is returned as the empty string.
    * An IPv4-style host ("#.#.#.#") is validated and returned unchanged.
    * A bracketed host ("[...]") is validated as an IPv6 address and returned
      without the enclosing brackets.
    * Any other ASCII host is lower-cased and percent-quoted.
    * A non-ASCII host is lower-cased and IDNA encoded.

    Raises `InvalidURL` for an invalid IP address or IDNA hostname.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        unbracketed = host[1:-1]
        try:
            ipaddress.IPv6Address(unbracketed)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return unbracketed

    lowered = host.lower()

    if host.isascii():
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        return quote(lowered, safe=SUB_DELIMS)

    # Anything else is treated as an internationalized (IDNA) hostname.
    try:
        return idna.encode(lowered).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")
+
+
def normalize_port(port: str | int | None, scheme: str) -> int | None:
    """
    Validate a port, returning it as an integer, or `None` if it is absent
    or redundant.

    * `port`: The port as a string of decimal digits, an integer, or `None`.
    * `scheme`: The URL scheme. Matched case-insensitively against the known
      default ports.

    Returns `None` when `port` is `None` or empty, or when it equals the
    default port for `scheme`. Otherwise returns the port as an integer.

    Raises `InvalidURL` if the port is not made up of ASCII digits, or is
    outside the range 0-65535.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    # RFC 3986 defines `port = *DIGIT`. On its own `int()` is too permissive:
    # it also accepts signs, surrounding whitespace, underscores and
    # non-ASCII digits, eg. "-1", "+80", " 80" or "8_0".
    if isinstance(port, str) and not (port.isascii() and port.isdigit()):
        raise InvalidURL(f"Invalid port: {port!r}")

    try:
        port_as_int = int(port)
    except ValueError as exc:
        raise InvalidURL(f"Invalid port: {port!r}") from exc

    # Port numbers are 16-bit values.
    if not 0 <= port_as_int <= 65535:
        raise InvalidURL(f"Invalid port: {port!r}")

    # See https://url.spec.whatwg.org/#url-miscellaneous
    #
    # Schemes are case-insensitive, so "HTTP" must find the same default
    # port as "http".
    default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
        scheme.lower()
    )
    if port_as_int == default_port:
        return None
    return port_as_int
+
+
def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check the path against the rules that depend on whether the URL has
    a scheme or authority component.

    Returns `None` if the path is acceptable, and raises `InvalidURL` otherwise.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # "If a URI contains an authority component, then the path component
        # must either be empty or begin with a slash ("/") character."
        if path and not path.startswith("/"):
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
        return

    # Without an authority component, the path cannot begin with two slash
    # characters ("//").
    if path.startswith("//"):
        raise InvalidURL(
            "URLs with no authority component cannot have a path starting with '//'"
        )

    # In addition, a URI reference (Section 4.1) may be a relative-path
    # reference, in which case the first path segment cannot contain a
    # colon (":") character. Only a leading colon is checked for here.
    if not has_scheme and path.startswith(":"):
        raise InvalidURL(
            "URLs with no scheme component cannot have a path starting with ':'"
        )
+
+
def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    A "." segment is dropped. A ".." segment is dropped together with the
    segment before it, except that the empty segment produced by a leading
    "/" is never removed.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in path.split("/"):
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            kept.pop()
    return "/".join(kept)
+
+
def percent_encode(char: str) -> str:
    """
    Return the percent-encoded representation of a single character.

    Each byte of the character's UTF-8 encoding becomes one '%XX' sequence
    with upper-case hex digits, so characters outside the ASCII range
    produce more than one sequence.

    For example:

        percent_encode(" ") == "%20"
    """
    utf8_bytes = char.encode("utf-8")
    return "".join(f"%{byte:02X}" for byte in utf8_bytes)
+
+
def is_safe(string: str, safe: str = "/") -> bool:
    """
    Determine if a given string is already quote-safe.

    Returns `True` if every character is an unreserved character, one of the
    characters in `safe`, or '%'. An empty string is considered safe.
    """
    allowed = UNRESERVED_CHARACTERS + safe + "%"
    return all(char in allowed for char in string)
+
+
def percent_encoded(string: str, safe: str = "/") -> str:
    """
    Use percent-encoding to quote a string.

    * `string`: The string to be percent-escaped.
    * `safe`: Characters that are left unescaped, in addition to the
      unreserved characters.

    Every other character, including '%', is replaced with its
    percent-encoded form. Existing '%xx' sequences are not recognised here,
    so "100%" becomes "100%25" whatever else the string contains.
    """
    non_escaped_chars = UNRESERVED_CHARACTERS + safe

    # Fast path for strings that don't need escaping. The check must not
    # treat '%' as safe, otherwise a '%' would only be escaped when the
    # string also happened to contain some other unsafe character.
    if not string.rstrip(non_escaped_chars):
        return string

    return "".join(
        [char if char in non_escaped_chars else percent_encode(char) for char in string]
    )
+
+
def quote(string: str, safe: str = "/") -> str:
    """
    Use percent-encoding to quote a string, omitting existing '%xx' escape sequences.

    The string is scanned for '%xx' escape sequences. These are copied through
    unchanged, and the text between them is passed to `percent_encoded`.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces: list[str] = []
    cursor = 0

    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Quote any plain text that sits before this '%xx' escape sequence.
        if escape.start() > cursor:
            pieces.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # Keep the '%xx' escape sequence exactly as it is.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Quote any plain text that follows the final '%xx' escape sequence.
    if cursor < len(string):
        pieces.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(pieces)
+
+
def urlencode(items: list[tuple[str, str]]) -> str:
    """
    Encode a list of (key, value) string pairs as a query string.

    Each key and value is passed to `percent_encoded` with no additional safe
    characters, then the pairs are joined as "key=value" separated by "&".
    An empty list gives an empty string.

    We can use a much simpler version of the stdlib urlencode here because
    we don't need to handle a bunch of different typing cases, such as bytes vs str.

    https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926

    Note that we use '%20' encoding for spaces, and '%2F' for '/'.
    This is slightly different than `requests`, but is the behaviour that browsers use.

    See
    - https://github.com/encode/httpx/issues/2536
    - https://github.com/encode/httpx/issues/2721
    - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    """
    pairs = []
    for key, value in items:
        encoded_key = percent_encoded(key, safe="")
        encoded_value = percent_encoded(value, safe="")
        pairs.append(f"{encoded_key}={encoded_value}")
    return "&".join(pairs)