# Source code for apeye.url

#!/usr/bin/env python
#
#  url.py
"""
:mod:`pathlib`-like approach to URLs.

.. versionchanged:: 1.0.0

	:class:`~apeye.slumber_url.SlumberURL` and :class:`~apeye.requests_url.RequestsURL`
	moved to :mod:`apeye.slumber_url` and :mod:`apeye.requests_url` respectively.
"""
#
#  Copyright © 2020-2021 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#  Based on the "pathlib" module from CPython.
#  Licensed under the Python Software Foundation License Version 2.
#  Copyright © 2001-2020 Python Software Foundation. All rights reserved.
#  Copyright © 2000 BeOpen.com. All rights reserved.
#  Copyright © 1995-2000 Corporation for National Research Initiatives. All rights reserved.
#  Copyright © 1991-1995 Stichting Mathematisch Centrum. All rights reserved.
#
#  Based on Slumber <https://slumber.readthedocs.io>
#  Copyright (c) 2011 Donald Stufft
#  Licensed under the 2-clause BSD License
#
#  Some docstrings from Requests <https://requests.readthedocs.io>
#  Copyright 2019 Kenneth Reitz
#  Licensed under the Apache License, Version 2.0
#

# stdlib
import ipaddress
import os
import pathlib
import re
from operator import attrgetter
from typing import (
		TYPE_CHECKING,
		Any,
		Dict,
		Iterable,
		List,
		Mapping,
		NamedTuple,
		Optional,
		Tuple,
		Type,
		TypeVar,
		Union
		)
from urllib.parse import ParseResult, parse_qs, urlencode, urlparse, urlunparse

# 3rd party
from domdf_python_tools.doctools import prettify_docstrings
from domdf_python_tools.typing import PathLike

# this package
from apeye import _tld

if TYPE_CHECKING:
	# stdlib
	from typing import NoReturn

# Names exported by ``from apeye.url import *``.
__all__ = ["URL", "URLPath", "Domain", "URLType", "URLPathType"]

# Type variable bound to ``URL``. It annotates ``self`` / ``cls`` and the return value
# of ``URL`` methods, so a method called on a subclass is typed as returning that subclass.
URLType = TypeVar("URLType", bound="URL")

# Type variable bound to ``URLPath``; it plays the same role for ``URLPath`` methods.
URLPathType = TypeVar("URLPathType", bound="URLPath")
"""
.. versionadded:: 1.1.0
"""


@prettify_docstrings
class URLPath(pathlib.PurePosixPath):
	"""
	Represents the path part of a URL.

	Subclass of :class:`pathlib.PurePosixPath` that provides a subset of its methods.

	.. versionchanged:: 1.1.0

		Implemented :meth:`~.URLPath.is_absolute`, :meth:`~.URLPath.joinpath`,
		:meth:`~.URLPath.relative_to`, :meth:`~.pathlib.PurePath.match`,
		``anchor``, ``drive``, and support for rich comparisons (``<``, ``<=``, ``>`` and ``>=``),
		which previously raised :exc:`NotImplementedError`.
	"""

	def __str__(self) -> str:
		"""
		Return the string representation of the path, suitable for passing to system calls.
		"""

		# The rendered string is cached on the instance as ``_str`` after the first call.
		try:
			return self._str  # type: ignore
		except AttributeError:
			# An empty path renders as '' because of the ``or ''`` fallback.
			# NOTE(review): ``_drv``, ``_root`` and ``_parts`` are private attributes, presumably
			# inherited from pathlib -- confirm they exist on every supported Python version.
			self._str = self._format_parsed_parts(self._drv, self._root, self._parts) or ''  # type: ignore
			return self._str

	# Delegates unchanged to the parent class implementation.
	# NOTE(review): presumably defined explicitly so the documentation tooling lists it -- confirm.
	def __repr__(self) -> str:
		return super().__repr__()

	@classmethod
	def _format_parsed_parts(cls, drv, root, parts):
		# Build the string form of the path from its drive, root and list of parts.
		if drv or root:
			# The first element of ``parts`` is skipped here.
			# NOTE(review): presumably it holds the drive/root anchor already emitted -- confirm.
			return drv + root + pathlib._posix_flavour.join(parts[1:])  # type: ignore
		else:
			# NOTE(review): ``pathlib._posix_flavour`` is a private pathlib name;
			# verify it is available on every supported Python version.
			return pathlib._posix_flavour.join(parts)  # type: ignore

	def is_absolute(self) -> bool:
		"""
		Returns whether the path is absolute (i.e. starts with ``/``).

		.. versionadded:: 1.1.0  previously raised :exc:`NotImplementedError`.
		"""

		return self.root == '/'

	def joinpath(self: URLPathType, *args) -> URLPathType:
		"""
		Combine this :class:`~.URLPath` with one or several arguments.

		.. versionadded:: 1.1.0  previously raised :exc:`NotImplementedError`.

		:returns: A new :class:`~.URLPath` representing either a subpath
			(if all arguments are relative paths) or a totally different path
			(if one of the arguments is absolute).
		"""

		return super().joinpath(*args)

	def relative_to(self: URLPathType, *other: PathLike) -> URLPathType:
		r"""
		Returns the relative path to another path identified by the passed arguments.

		The arguments are joined together to form a single path, and therefore the following behave identically:

		.. code-block:: pycon

			>>> URLPath("/news/sport").relative_to("/", "news")
			URLPath('sport')
			>>> URLPath("/news/sport").relative_to("/news")
			URLPath('sport')

		.. versionadded:: 1.1.0  previously raised :exc:`NotImplementedError`.

		:param \*other:

		:raises ValueError: if the operation is not possible (because this is not a subpath of the other path)

		.. seealso::

			:meth:`~.URL.relative_to`, which is recommended when constructing a relative path from a :class:`~URL`.
			This method cannot correctly handle some cases, such as:

			.. code-block:: pycon

				>>> URL("https://github.com/domdfcoding").path.relative_to(URL("https://github.com").path)
				Traceback (most recent call last):
				ValueError: '/domdfcoding' does not start with ''

			Since ``URL("https://github.com").path`` is ``URLPath('')``.

			Instead, use:

				>>> URL("https://github.com/domdfcoding").relative_to(URL("https://github.com"))
				URLPath('domdfcoding')
		"""

		return super().relative_to(*other)

	# Not supported for URL paths: every call raises NotImplementedError.
	def as_uri(self, *args, **kwargs) -> "NoReturn":  # noqa: D102
		raise NotImplementedError
class URL(os.PathLike):
	r"""
	:mod:`pathlib`-like class for URLs.

	:param url: The URL to construct the :class:`~apeye.url.URL` object from.

	.. versionchanged:: 0.3.0  The ``url`` parameter can now be a string or a :class:`~.URL`.

	.. versionchanged:: 1.1.0

		Added support for sorting and rich comparisons (``<``, ``<=``, ``>`` and ``>=``).

	.. autoclasssumm:: URL
		:autosummary-sections: Methods

	.. autosummary-widths:: 1/5

	.. autoclasssumm:: URL
		:autosummary-sections: Attributes
	"""

	#: URL scheme specifier
	scheme: str

	#: Network location part of the URL
	netloc: str

	#: The hierarchical path of the URL
	path: URLPath

	query: Dict[str, List[str]]
	"""
	The query parameters of the URL, if present.

	.. versionadded:: 0.7.0
	"""

	fragment: Optional[str]
	"""
	The URL fragment, used to identify a part of the document.
	:py:obj:`None` if absent from the URL.

	.. versionadded:: 0.7.0
	"""

	# Matches an optional ``scheme:`` prefix followed by ``//`` at the start of a URL.
	# After the first character the scheme may contain digits and ``+`` as well as letters,
	# ``-`` and ``.`` (RFC 3986 section 3.1), so ``s3://`` and ``git+ssh://`` are recognised.
	# The first character keeps the set accepted historically (letters, ``-`` and ``.``).
	_scheme_prefix_re = re.compile(r"([A-Za-z.-][A-Za-z0-9+.-]*:)?//")

	def __init__(self, url: Union[str, "URL"] = ''):
		if isinstance(url, URL):
			url = str(url)

		# ``urlparse`` only recognises a netloc when it is introduced by ``//``,
		# so input without a ``scheme://`` or ``//`` prefix (e.g. ``example.com/path``) is given one.
		if not self._scheme_prefix_re.match(url):
			url = "//" + str(url)

		# ``params`` is unpacked but deliberately not stored on the instance.
		scheme, netloc, parts, params, query, fragment = urlparse(url)

		self.scheme: str = scheme
		self.netloc: str = netloc
		self.path = URLPath(parts)
		self.query = parse_qs(query or '')
		self.fragment = fragment or None

	@property
	def port(self) -> Optional[int]:
		"""
		The port number of the URL as an integer, if present. Default :py:obj:`None`.

		Any ``user:password@`` prefix is ignored, and bracketed IPv6 hosts
		(e.g. ``[::1]:8080``) are supported.
		An empty port (``host:``) is treated as absent.

		:raises ValueError: if the port is present but is not an integer.

		.. versionadded:: 0.7.0
		"""

		# Drop the userinfo so a colon in "user:password" is not mistaken for the port separator.
		host_and_port = self.netloc.rpartition('@')[2]

		if host_and_port.startswith('['):
			# Bracketed IPv6 literal: colons inside the brackets belong to the address,
			# so only look at what follows the closing bracket.
			host_and_port = host_and_port.rpartition(']')[2]

		_, separator, port = host_and_port.rpartition(':')

		if not separator or not port:
			return None

		return int(port)

	@classmethod
	def from_parts(
			cls: Type[URLType],
			scheme: str,
			netloc: str,
			path: PathLike,
			query: Optional[Mapping[Any, List]] = None,
			fragment: Optional[str] = None,
			) -> URLType:
		"""
		Construct a :class:`~apeye.url.URL` from a scheme, netloc and path.

		:param scheme: The scheme of the URL, e.g ``'http'``.
		:param netloc: The netloc of the URL, e.g. ``'bbc.co.uk:80'``.
		:param path: The path of the URL, e.g. ``'/news'``.
		:param query: The query parameters of the URL, if present.
		:param fragment: The URL fragment, used to identify a part of the document.
			:py:obj:`None` if absent from the URL.

		Put together, the resulting path would be ``'http://bbc.co.uk:80/news'``

		:rtype:

		.. versionchanged:: 0.7.0  Added the ``query`` and ``fragment`` arguments.
		"""

		obj = cls('')
		obj.scheme = scheme
		obj.netloc = netloc
		# Copy the mapping so the new URL does not share the caller's object.
		obj.query = dict(query or {})
		obj.fragment = fragment or None

		# The stored path is always rooted: a relative ``path`` gets a leading '/'.
		path = URLPath(path)
		if path.root == '/':
			obj.path = path
		else:
			obj.path = URLPath('/' + str(path))

		return obj

	def __str__(self) -> str:
		"""
		Returns the :class:`~apeye.url.URL` as a string.
		"""

		query = urlencode(self.query, doseq=True)
		url = urlunparse([self.scheme, self.netloc, str(self.path), None, query, self.fragment])

		# A URL without a scheme is rendered without the leading "//".
		if url.startswith("//"):
			return url[2:]
		else:
			return url

	def __repr__(self) -> str:
		"""
		Returns the string representation of the :class:`~apeye.url.URL`.
		"""

		return f"{self.__class__.__name__}({str(self)!r})"

	def __truediv__(self: URLType, key: Union[PathLike, int]) -> URLType:
		"""
		Construct a new :class:`~apeye.url.URL` object for the given child of this :class:`~apeye.url.URL`.

		:rtype:

		.. versionchanged:: 0.7.0

			* Added support for division by integers.

			* Now officially supports the new path having a URL fragment and/or query parameters.
			  Any URL fragment or query parameters from the parent URL are not inherited by its children.
		"""

		try:
			return self._make_child((key, ))
		except TypeError:
			# Let Python try the reflected operation / raise its standard operand error.
			return NotImplemented

	def _make_child(self: URLType, args: Iterable[Union[PathLike, int]]) -> URLType:
		"""
		Construct a new :class:`~apeye.url.URL` object by combining the given arguments with this instance's path part.

		.. versionadded:: 1.1.0 (private)

		Except for the final path element any queries and fragments are ignored.

		:returns: A new :class:`~.URL` representing either a subpath
			(if all arguments are relative paths) or a totally different path
			(if one of the arguments is absolute).
			:py:obj:`NotImplemented` is returned if joining the paths raises :exc:`TypeError`.

		:raises TypeError: if an argument cannot be parsed as a URL.
		"""

		parsed_args: List[ParseResult] = []

		for arg in args:
			# Keep the untouched argument for the error message below.
			raw_arg = arg

			if isinstance(arg, pathlib.PurePath):
				arg = arg.as_posix()
			elif isinstance(arg, os.PathLike):
				arg = os.fspath(arg)
			elif isinstance(arg, int):
				arg = str(arg)

			try:
				parse_result = urlparse(arg)
			except AttributeError as e:
				# A missing ``decode`` attribute is reported as a TypeError naming the
				# offending argument's type; any other AttributeError is re-raised untouched.
				if str(e).endswith("'decode'"):
					msg = f"Cannot join {type(raw_arg).__name__!r} to a {type(self.path).__name__!r}"
					raise TypeError(msg) from None
				else:
					raise

			parsed_args.append(parse_result)

		try:
			new_path = self.from_parts(
					self.scheme,
					self.netloc,
					self.path.joinpath(*map(attrgetter("path"), parsed_args)),
					)
		except TypeError:
			return NotImplemented

		# Only the final argument contributes a query string and fragment.
		if parsed_args:
			new_path.query = parse_qs(parsed_args[-1].query)
			new_path.fragment = parsed_args[-1].fragment or None

		return new_path

	def joinurl(self: URLType, *args) -> URLType:
		"""
		Construct a new :class:`~apeye.url.URL` object by combining the given arguments with this instance's path part.

		.. versionadded:: 1.1.0

		Except for the final path element any queries and fragments are ignored.

		:returns: A new :class:`~.URL` representing either a subpath
			(if all arguments are relative paths) or a totally different path
			(if one of the arguments is absolute).

		:raises TypeError: if any of the arguments cannot be joined to the path.
		"""

		child = self._make_child(args)

		# ``NotImplemented`` is only meaningful as the result of an operator method;
		# from a regular method it must surface as an error rather than leak to the caller.
		if child is NotImplemented:
			raise TypeError(f"Cannot join {args!r} to a {type(self.path).__name__!r}")

		return child

	def __fspath__(self) -> str:
		"""
		Returns the file system path representation of the :class:`~.URL`.

		This is comprised of the ``netloc`` and ``path`` attributes.
		"""

		return f"{self.netloc}{self.path}"

	def __eq__(self, other) -> bool:
		"""
		Return ``self == other``.

		.. latex:vspace:: -10px

		.. attention::

			URL fragments and query parameters are not compared.

			.. seealso:: :meth:`.URL.strict_compare`, which *does* consider those attributes.

		.. latex:vspace:: -20px
		"""

		if isinstance(other, URL):
			return self.netloc == other.netloc and self.scheme == other.scheme and self.path == other.path
		else:
			return NotImplemented

	# The ordering comparisons all compare the ``_parts_port`` tuples of the two URLs.

	def __lt__(self, other):
		if isinstance(other, URL):
			return self._parts_port < other._parts_port
		else:
			return NotImplemented

	def __le__(self, other):
		if isinstance(other, URL):
			return self._parts_port <= other._parts_port
		else:
			return NotImplemented

	def __gt__(self, other):
		if isinstance(other, URL):
			return self._parts_port > other._parts_port
		else:
			return NotImplemented

	def __ge__(self, other):
		if isinstance(other, URL):
			return self._parts_port >= other._parts_port
		else:
			return NotImplemented

	def strict_compare(self, other) -> bool:
		"""
		Return ``self ≡ other``, comparing the scheme, netloc, path, fragment and query parameters.

		Objects which are not :class:`~.URL` instances never compare equal.

		.. versionadded:: 0.7.0
		"""

		# This is a regular method, not an operator hook, so a plain ``False`` is returned
		# instead of the (truthy) ``NotImplemented`` sentinel.
		if not isinstance(other, URL):
			return False

		return (
				self.netloc == other.netloc and self.scheme == other.scheme and self.path == other.path
				and self.query == other.query and self.fragment == other.fragment
				)

	def __hash__(self) -> int:
		"""
		Returns the hash of the :class:`~apeye.url.URL` .
		"""

		# Built from the same attributes ``__eq__`` compares.
		return hash((self.scheme, self.netloc, self.path))

	@property
	def name(self) -> str:
		"""
		The final path component, if any.
		"""

		return self.path.name

	@property
	def suffix(self) -> str:
		"""
		The final component's last suffix, if any.

		This includes the leading period. For example: ``'.txt'``.
		"""

		return self.path.suffix

	@property
	def suffixes(self) -> List[str]:
		"""
		A list of the final component's suffixes, if any.

		These include the leading periods. For example: ``['.tar', '.gz']``.
		"""

		return self.path.suffixes

	@property
	def stem(self) -> str:
		"""
		The final path component, minus its last suffix.
		"""

		return self.path.stem

	def with_name(self: URLType, name: str, inherit: bool = True) -> URLType:
		"""
		Return a new :class:`~apeye.url.URL` with the file name changed.

		:param name:
		:param inherit: Whether the new :class:`~apeye.url.URL` should inherit the query string
			and fragment from this :class:`~apeye.url.URL`.

		:rtype:

		.. versionchanged:: 0.7.0  Added the ``inherit`` parameter.
		"""

		if inherit:
			kwargs = {"query": self.query, "fragment": self.fragment}
		else:
			kwargs = {}

		return self.from_parts(
				self.scheme,
				self.netloc,
				self.path.with_name(name),
				**kwargs,  # type: ignore
				)

	def with_suffix(self: URLType, suffix: str, inherit: bool = True) -> URLType:
		"""
		Returns a new :class:`~apeye.url.URL` with the file suffix changed.

		If the :class:`~apeye.url.URL` has no suffix, add the given suffix.

		If the given suffix is an empty string, remove the suffix from the :class:`~apeye.url.URL`.

		:param suffix:
		:param inherit: Whether the new :class:`~apeye.url.URL` should inherit the query string
			and fragment from this :class:`~apeye.url.URL`.

		:rtype:

		.. versionchanged:: 0.7.0  Added the ``inherit`` parameter.
		"""

		if inherit:
			kwargs = {"query": self.query, "fragment": self.fragment}
		else:
			kwargs = {}

		return self.from_parts(
				self.scheme,
				self.netloc,
				self.path.with_suffix(suffix),
				**kwargs,  # type: ignore
				)

	@property
	def parts(self) -> Tuple[str, ...]:
		"""
		An object providing sequence-like access to the components in the URL.

		To retrieve only the parts of the path, use :meth:`URL.path.parts <URLPath.parts>`.
		"""

		# Extract the domain once rather than once per field.
		domain = self.domain

		return (
				self.scheme,
				domain.subdomain,
				domain.domain,
				domain.suffix,
				*('/' / self.path).parts[1:],
				)

	@property
	def _parts_port(self) -> Tuple:
		"""
		An object providing sequence-like access to the components in the URL.

		Unlike ``.parts`` this includes the port.

		To retrieve only the parts of the path, use :meth:`URL.path.parts <URLPath.parts>`.

		.. versionadded:: 1.1.0 (private)
		"""

		# Extract the domain once rather than once per field.
		domain = self.domain

		return (
				self.scheme,
				domain.subdomain,
				domain.domain,
				domain.suffix,
				# A missing port is represented as 0 so the tuples stay comparable.
				self.port or 0,
				*('/' / self.path).parts[1:],
				)

	@property
	def parent(self: URLType) -> URLType:
		"""
		The logical parent of the :class:`~apeye.url.URL`.
		"""

		return self.from_parts(self.scheme, self.netloc, self.path.parent)

	@property
	def parents(self: URLType) -> Tuple[URLType, ...]:
		"""
		An immutable sequence providing access to the logical ancestors of the :class:`~apeye.url.URL`.
		"""

		return tuple(self.from_parts(self.scheme, self.netloc, path) for path in self.path.parents)

	@property
	def fqdn(self) -> str:
		"""
		Returns the Fully Qualified Domain Name of the :class:`~apeye.url.URL` .
		"""

		return self.domain.fqdn

	@property
	def domain(self) -> "Domain":
		"""
		Returns a :class:`apeye.url.Domain` object representing the domain part of the URL.
		"""

		return Domain._make(_tld.extract_tld(self.netloc))

	@property
	def base_url(self: URLType) -> URLType:
		"""
		Returns a :class:`apeye.url.URL` object representing the URL without query strings or URL fragments.

		.. versionadded:: 0.7.0
		"""

		return self.from_parts(
				self.scheme,
				self.netloc,
				self.path,
				)

	def relative_to(self, other: Union[str, "URL", URLPath]) -> URLPath:
		"""
		Returns a version of this URL's path relative to ``other``.

		.. versionadded:: 1.1.0

		:param other: Either a :class:`~.URL`, or a string or :class:`~.URLPath` representing an *absolute* path.
			If a :class:`~.URL`, the :attr:`~.URL.netloc` must match this URL's.

		:raises ValueError: if the operation is not possible
			(i.e. because this URL's path is not a subpath of the other path)
		"""

		if isinstance(other, URLPath):
			if not other.is_absolute():
				raise ValueError("'URL.relative_to' cannot be used with relative URLPath objects")
			else:
				other = URL('/') / other

		elif not isinstance(other, URL):
			# Parse other as a URL
			other = URL(other)

		# Compare netloc, if both have one
		if self.netloc and other.netloc and self.netloc.lower() != other.netloc.lower():
			raise ValueError(f"{self!r} does not start with {other!r}")

		# Make the paths absolute
		# If coming from a URL they must always be absolute
		our_path = '/' / self.path
		other_path = '/' / other.path

		relative_path = our_path.relative_to(other_path)

		return relative_path
class Domain(NamedTuple):
	"""
	:class:`typing.NamedTuple` holding the subdomain, domain and suffix of a URL.
	"""

	subdomain: str
	domain: str
	suffix: str

	@property
	def registered_domain(self):
		"""
		The domain and suffix joined with a dot, or an empty string unless both are set.

		.. code-block:: python

			>>> URL('https://forums.bbc.co.uk').domain.registered_domain
			'bbc.co.uk'
			>>> URL('https://localhost:8080').domain.registered_domain
			''
		"""

		if not (self.domain and self.suffix):
			return ''

		return f"{self.domain}.{self.suffix}"

	@property
	def fqdn(self):
		"""
		The Fully Qualified Domain Name, or an empty string without a proper domain and suffix.

		.. code-block:: python

			>>> URL('https://forums.bbc.co.uk/path/to/file').domain.fqdn
			'forums.bbc.co.uk'
			>>> URL('https://localhost:8080').domain.fqdn
			''
		"""

		if not (self.domain and self.suffix):
			return ''

		# Iterating the namedtuple yields (subdomain, domain, suffix); empty fields are dropped.
		return '.'.join(filter(None, self))

	@property
	def ipv4(self) -> Optional[ipaddress.IPv4Address]:
		"""
		The IPv4 address, if that is what the presented domain/url is; otherwise :py:obj:`None`.

		.. code-block:: python

			>>> URL('https://127.0.0.1/path/to/file').domain.ipv4
			IPv4Address('127.0.0.1')
			>>> URL('https://127.0.0.1.1/path/to/file').domain.ipv4
			>>> URL('https://256.1.1.1').domain.ipv4
		"""

		# An address is only considered when nothing was split off as a subdomain or suffix.
		if self.suffix or self.subdomain:
			return None

		if not _tld.IP_RE.match(self.domain):
			return None

		return ipaddress.ip_address(self.domain)

	def __repr__(self) -> str:
		"""
		Return a string representation of the :class:`~.Domain`.
		"""

		# Defined explicitly so that the method carries its own docstring.
		field_reprs = ", ".join(f"{name}={value!r}" for name, value in zip(self._fields, self))
		return f"{self.__class__.__name__}({field_reprs})"