From 12cf076118570eebbff08c6b3090e0d4798447a1 Mon Sep 17 00:00:00 2001 From: cyfraeviolae Date: Wed, 3 Apr 2024 03:17:55 -0400 Subject: no venv --- .../python3.11/site-packages/httpx/_urlparse.py | 502 --------------------- 1 file changed, 502 deletions(-) delete mode 100644 venv/lib/python3.11/site-packages/httpx/_urlparse.py (limited to 'venv/lib/python3.11/site-packages/httpx/_urlparse.py') diff --git a/venv/lib/python3.11/site-packages/httpx/_urlparse.py b/venv/lib/python3.11/site-packages/httpx/_urlparse.py deleted file mode 100644 index 6a4b55b..0000000 --- a/venv/lib/python3.11/site-packages/httpx/_urlparse.py +++ /dev/null @@ -1,502 +0,0 @@ -""" -An implementation of `urlparse` that provides URL validation and normalization -as described by RFC3986. - -We rely on this implementation rather than the one in Python's stdlib, because: - -* It provides more complete URL validation. -* It properly differentiates between an empty querystring and an absent querystring, - to distinguish URLs with a trailing '?'. -* It handles scheme, hostname, port, and path normalization. -* It supports IDNA hostnames, normalizing them to their encoded form. -* The API supports passing individual components, as well as the complete URL string. - -Previously we relied on the excellent `rfc3986` package to handle URL parsing and -validation, but this module provides a simpler alternative, with less indirection -required. -""" -from __future__ import annotations - -import ipaddress -import re -import typing - -import idna - -from ._exceptions import InvalidURL - -MAX_URL_LENGTH = 65536 - -# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3 -UNRESERVED_CHARACTERS = ( - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~" -) -SUB_DELIMS = "!$&'()*+,;=" - -PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}") - - -# {scheme}: (optional) -# //{authority} (optional) -# {path} -# ?{query} (optional) -# #{fragment} (optional) -URL_REGEX = re.compile( - ( - r"(?:(?P{scheme}):)?" - r"(?://(?P{authority}))?" - r"(?P{path})" - r"(?:\?(?P{query}))?" - r"(?:#(?P{fragment}))?" - ).format( - scheme="([a-zA-Z][a-zA-Z0-9+.-]*)?", - authority="[^/?#]*", - path="[^?#]*", - query="[^#]*", - fragment=".*", - ) -) - -# {userinfo}@ (optional) -# {host} -# :{port} (optional) -AUTHORITY_REGEX = re.compile( - ( - r"(?:(?P{userinfo})@)?" r"(?P{host})" r":?(?P{port})?" - ).format( - userinfo=".*", # Any character sequence. - host="(\\[.*\\]|[^:@]*)", # Either any character sequence excluding ':' or '@', - # or an IPv6 address enclosed within square brackets. - port=".*", # Any character sequence. - ) -) - - -# If we call urlparse with an individual component, then we need to regex -# validate that component individually. -# Note that we're duplicating the same strings as above. Shock! Horror!! -COMPONENT_REGEX = { - "scheme": re.compile("([a-zA-Z][a-zA-Z0-9+.-]*)?"), - "authority": re.compile("[^/?#]*"), - "path": re.compile("[^?#]*"), - "query": re.compile("[^#]*"), - "fragment": re.compile(".*"), - "userinfo": re.compile("[^@]*"), - "host": re.compile("(\\[.*\\]|[^:]*)"), - "port": re.compile(".*"), -} - - -# We use these simple regexs as a first pass before handing off to -# the stdlib 'ipaddress' module for IP address validation. -IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$") -IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$") - - -class ParseResult(typing.NamedTuple): - scheme: str - userinfo: str - host: str - port: int | None - path: str - query: str | None - fragment: str | None - - @property - def authority(self) -> str: - return "".join( - [ - f"{self.userinfo}@" if self.userinfo else "", - f"[{self.host}]" if ":" in self.host else self.host, - f":{self.port}" if self.port is not None else "", - ] - ) - - @property - def netloc(self) -> str: - return "".join( - [ - f"[{self.host}]" if ":" in self.host else self.host, - f":{self.port}" if self.port is not None else "", - ] - ) - - def copy_with(self, **kwargs: str | None) -> ParseResult: - if not kwargs: - return self - - defaults = { - "scheme": self.scheme, - "authority": self.authority, - "path": self.path, - "query": self.query, - "fragment": self.fragment, - } - defaults.update(kwargs) - return urlparse("", **defaults) - - def __str__(self) -> str: - authority = self.authority - return "".join( - [ - f"{self.scheme}:" if self.scheme else "", - f"//{authority}" if authority else "", - self.path, - f"?{self.query}" if self.query is not None else "", - f"#{self.fragment}" if self.fragment is not None else "", - ] - ) - - -def urlparse(url: str = "", **kwargs: str | None) -> ParseResult: - # Initial basic checks on allowable URLs. - # --------------------------------------- - - # Hard limit the maximum allowable URL length. - if len(url) > MAX_URL_LENGTH: - raise InvalidURL("URL too long") - - # If a URL includes any ASCII control characters including \t, \r, \n, - # then treat it as invalid. - if any(char.isascii() and not char.isprintable() for char in url): - raise InvalidURL("Invalid non-printable ASCII character in URL") - - # Some keyword arguments require special handling. - # ------------------------------------------------ - - # Coerce "port" to a string, if it is provided as an integer. - if "port" in kwargs: - port = kwargs["port"] - kwargs["port"] = str(port) if isinstance(port, int) else port - - # Replace "netloc" with "host and "port". - if "netloc" in kwargs: - netloc = kwargs.pop("netloc") or "" - kwargs["host"], _, kwargs["port"] = netloc.partition(":") - - # Replace "username" and/or "password" with "userinfo". - if "username" in kwargs or "password" in kwargs: - username = quote(kwargs.pop("username", "") or "") - password = quote(kwargs.pop("password", "") or "") - kwargs["userinfo"] = f"{username}:{password}" if password else username - - # Replace "raw_path" with "path" and "query". - if "raw_path" in kwargs: - raw_path = kwargs.pop("raw_path") or "" - kwargs["path"], seperator, kwargs["query"] = raw_path.partition("?") - if not seperator: - kwargs["query"] = None - - # Ensure that IPv6 "host" addresses are always escaped with "[...]". - if "host" in kwargs: - host = kwargs.get("host") or "" - if ":" in host and not (host.startswith("[") and host.endswith("]")): - kwargs["host"] = f"[{host}]" - - # If any keyword arguments are provided, ensure they are valid. - # ------------------------------------------------------------- - - for key, value in kwargs.items(): - if value is not None: - if len(value) > MAX_URL_LENGTH: - raise InvalidURL(f"URL component '{key}' too long") - - # If a component includes any ASCII control characters including \t, \r, \n, - # then treat it as invalid. - if any(char.isascii() and not char.isprintable() for char in value): - raise InvalidURL( - f"Invalid non-printable ASCII character in URL component '{key}'" - ) - - # Ensure that keyword arguments match as a valid regex. - if not COMPONENT_REGEX[key].fullmatch(value): - raise InvalidURL(f"Invalid URL component '{key}'") - - # The URL_REGEX will always match, but may have empty components. - url_match = URL_REGEX.match(url) - assert url_match is not None - url_dict = url_match.groupdict() - - # * 'scheme', 'authority', and 'path' may be empty strings. - # * 'query' may be 'None', indicating no trailing "?" portion. - # Any string including the empty string, indicates a trailing "?". - # * 'fragment' may be 'None', indicating no trailing "#" portion. - # Any string including the empty string, indicates a trailing "#". - scheme = kwargs.get("scheme", url_dict["scheme"]) or "" - authority = kwargs.get("authority", url_dict["authority"]) or "" - path = kwargs.get("path", url_dict["path"]) or "" - query = kwargs.get("query", url_dict["query"]) - fragment = kwargs.get("fragment", url_dict["fragment"]) - - # The AUTHORITY_REGEX will always match, but may have empty components. - authority_match = AUTHORITY_REGEX.match(authority) - assert authority_match is not None - authority_dict = authority_match.groupdict() - - # * 'userinfo' and 'host' may be empty strings. - # * 'port' may be 'None'. - userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or "" - host = kwargs.get("host", authority_dict["host"]) or "" - port = kwargs.get("port", authority_dict["port"]) - - # Normalize and validate each component. - # We end up with a parsed representation of the URL, - # with components that are plain ASCII bytestrings. - parsed_scheme: str = scheme.lower() - parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":") - parsed_host: str = encode_host(host) - parsed_port: int | None = normalize_port(port, scheme) - - has_scheme = parsed_scheme != "" - has_authority = ( - parsed_userinfo != "" or parsed_host != "" or parsed_port is not None - ) - validate_path(path, has_scheme=has_scheme, has_authority=has_authority) - if has_authority: - path = normalize_path(path) - - # The GEN_DELIMS set is... : / ? # [ ] @ - # These do not need to be percent-quoted unless they serve as delimiters for the - # specific component. - - # For 'path' we need to drop ? and # from the GEN_DELIMS set. - parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@") - # For 'query' we need to drop '#' from the GEN_DELIMS set. - parsed_query: str | None = ( - None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@") - ) - # For 'fragment' we can include all of the GEN_DELIMS set. - parsed_fragment: str | None = ( - None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@") - ) - - # The parsed ASCII bytestrings are our canonical form. - # All properties of the URL are derived from these. - return ParseResult( - parsed_scheme, - parsed_userinfo, - parsed_host, - parsed_port, - parsed_path, - parsed_query, - parsed_fragment, - ) - - -def encode_host(host: str) -> str: - if not host: - return "" - - elif IPv4_STYLE_HOSTNAME.match(host): - # Validate IPv4 hostnames like #.#.#.# - # - # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2 - # - # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet - try: - ipaddress.IPv4Address(host) - except ipaddress.AddressValueError: - raise InvalidURL(f"Invalid IPv4 address: {host!r}") - return host - - elif IPv6_STYLE_HOSTNAME.match(host): - # Validate IPv6 hostnames like [...] - # - # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2 - # - # "A host identified by an Internet Protocol literal address, version 6 - # [RFC3513] or later, is distinguished by enclosing the IP literal - # within square brackets ("[" and "]"). This is the only place where - # square bracket characters are allowed in the URI syntax." - try: - ipaddress.IPv6Address(host[1:-1]) - except ipaddress.AddressValueError: - raise InvalidURL(f"Invalid IPv6 address: {host!r}") - return host[1:-1] - - elif host.isascii(): - # Regular ASCII hostnames - # - # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2 - # - # reg-name = *( unreserved / pct-encoded / sub-delims ) - return quote(host.lower(), safe=SUB_DELIMS) - - # IDNA hostnames - try: - return idna.encode(host.lower()).decode("ascii") - except idna.IDNAError: - raise InvalidURL(f"Invalid IDNA hostname: {host!r}") - - -def normalize_port(port: str | int | None, scheme: str) -> int | None: - # From https://tools.ietf.org/html/rfc3986#section-3.2.3 - # - # "A scheme may define a default port. For example, the "http" scheme - # defines a default port of "80", corresponding to its reserved TCP - # port number. The type of port designated by the port number (e.g., - # TCP, UDP, SCTP) is defined by the URI scheme. URI producers and - # normalizers should omit the port component and its ":" delimiter if - # port is empty or if its value would be the same as that of the - # scheme's default." - if port is None or port == "": - return None - - try: - port_as_int = int(port) - except ValueError: - raise InvalidURL(f"Invalid port: {port!r}") - - # See https://url.spec.whatwg.org/#url-miscellaneous - default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get( - scheme - ) - if port_as_int == default_port: - return None - return port_as_int - - -def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None: - """ - Path validation rules that depend on if the URL contains - a scheme or authority component. - - See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3 - """ - if has_authority: - # If a URI contains an authority component, then the path component - # must either be empty or begin with a slash ("/") character." - if path and not path.startswith("/"): - raise InvalidURL("For absolute URLs, path must be empty or begin with '/'") - else: - # If a URI does not contain an authority component, then the path cannot begin - # with two slash characters ("//"). - if path.startswith("//"): - raise InvalidURL( - "URLs with no authority component cannot have a path starting with '//'" - ) - # In addition, a URI reference (Section 4.1) may be a relative-path reference, - # in which case the first path segment cannot contain a colon (":") character. - if path.startswith(":") and not has_scheme: - raise InvalidURL( - "URLs with no scheme component cannot have a path starting with ':'" - ) - - -def normalize_path(path: str) -> str: - """ - Drop "." and ".." segments from a URL path. - - For example: - - normalize_path("/path/./to/somewhere/..") == "/path/to" - """ - # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4 - components = path.split("/") - output: list[str] = [] - for component in components: - if component == ".": - pass - elif component == "..": - if output and output != [""]: - output.pop() - else: - output.append(component) - return "/".join(output) - - -def percent_encode(char: str) -> str: - """ - Replace a single character with the percent-encoded representation. - - Characters outside the ASCII range are represented with their a percent-encoded - representation of their UTF-8 byte sequence. - - For example: - - percent_encode(" ") == "%20" - """ - return "".join([f"%{byte:02x}" for byte in char.encode("utf-8")]).upper() - - -def is_safe(string: str, safe: str = "/") -> bool: - """ - Determine if a given string is already quote-safe. - """ - NON_ESCAPED_CHARS = UNRESERVED_CHARACTERS + safe + "%" - - # All characters must already be non-escaping or '%' - for char in string: - if char not in NON_ESCAPED_CHARS: - return False - - return True - - -def percent_encoded(string: str, safe: str = "/") -> str: - """ - Use percent-encoding to quote a string. - """ - if is_safe(string, safe=safe): - return string - - NON_ESCAPED_CHARS = UNRESERVED_CHARACTERS + safe - return "".join( - [char if char in NON_ESCAPED_CHARS else percent_encode(char) for char in string] - ) - - -def quote(string: str, safe: str = "/") -> str: - """ - Use percent-encoding to quote a string, omitting existing '%xx' escape sequences. - - See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1 - - * `string`: The string to be percent-escaped. - * `safe`: A string containing characters that may be treated as safe, and do not - need to be escaped. Unreserved characters are always treated as safe. - See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3 - """ - parts = [] - current_position = 0 - for match in re.finditer(PERCENT_ENCODED_REGEX, string): - start_position, end_position = match.start(), match.end() - matched_text = match.group(0) - # Add any text up to the '%xx' escape sequence. - if start_position != current_position: - leading_text = string[current_position:start_position] - parts.append(percent_encoded(leading_text, safe=safe)) - - # Add the '%xx' escape sequence. - parts.append(matched_text) - current_position = end_position - - # Add any text after the final '%xx' escape sequence. - if current_position != len(string): - trailing_text = string[current_position:] - parts.append(percent_encoded(trailing_text, safe=safe)) - - return "".join(parts) - - -def urlencode(items: list[tuple[str, str]]) -> str: - """ - We can use a much simpler version of the stdlib urlencode here because - we don't need to handle a bunch of different typing cases, such as bytes vs str. - - https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926 - - Note that we use '%20' encoding for spaces. and '%2F for '/'. - This is slightly different than `requests`, but is the behaviour that browsers use. - - See - - https://github.com/encode/httpx/issues/2536 - - https://github.com/encode/httpx/issues/2721 - - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode - """ - return "&".join( - [ - percent_encoded(k, safe="") + "=" + percent_encoded(v, safe="") - for k, v in items - ] - ) -- cgit v1.2.3