Initial commit

This commit is contained in:
2025-10-29 14:37:08 +08:00
commit 034509181f
4131 changed files with 555736 additions and 0 deletions

View File

@@ -0,0 +1,38 @@
# flake8: noqa: F401
import warnings
from .common import (
HEADRequest,
PATCHRequest,
PUTRequest,
Request,
RequestDirector,
RequestHandler,
Response,
)
# isort: split
# TODO: all request handlers should be safely imported
from . import _urllib
from ..utils import bug_reports_message
# Optional request handlers below each depend on a third-party package that
# may be absent. A plain ImportError means the dependency is simply not
# installed and is silently ignored; any other exception during import is
# unexpected and surfaced as a warning with the bug-report footer appended.
try:
    from . import _requests
except ImportError:
    pass
except Exception as e:
    warnings.warn(f'Failed to import "requests" request handler: {e}' + bug_reports_message())
try:
    from . import _websockets
except ImportError:
    pass
except Exception as e:
    warnings.warn(f'Failed to import "websockets" request handler: {e}' + bug_reports_message())
try:
    from . import _curlcffi
except ImportError:
    pass
except Exception as e:
    warnings.warn(f'Failed to import "curl_cffi" request handler: {e}' + bug_reports_message())

View File

@@ -0,0 +1,321 @@
from __future__ import annotations
import io
import itertools
import math
import re
import urllib.parse
from ._helper import InstanceStoreMixin
from ..utils.networking import select_proxy
from .common import (
Features,
Request,
Response,
register_preference,
register_rh,
)
from .exceptions import (
CertificateVerifyError,
HTTPError,
IncompleteRead,
ProxyError,
SSLError,
TransportError,
)
from .impersonate import ImpersonateRequestHandler, ImpersonateTarget
from ..dependencies import curl_cffi, certifi
from ..utils import int_or_none
# Importing this module requires a supported curl_cffi; the ImportError raised
# here is swallowed by the optional-import guard in the package __init__.
if curl_cffi is None:
    raise ImportError('curl_cffi is not installed')
# Parse the version string into a numeric triple, e.g. '0.11.1b2' -> (0, 11, 1);
# any non-digit run acts as a separator.
curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3]))
# Accept exactly 0.5.10 (legacy pin) or the 0.10.x through 0.13.x series
if curl_cffi_version != (0, 5, 10) and not (0, 10) <= curl_cffi_version < (0, 14):
    # NOTE(review): _yt_dlp__version is presumably read elsewhere for
    # diagnostics/verbose output — confirm against the reporting code
    curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
    raise ImportError('Only curl_cffi versions 0.5.10, 0.10.x, 0.11.x, 0.12.x, 0.13.x are supported')
import curl_cffi.requests
from curl_cffi.const import CurlECode, CurlOpt
class CurlCFFIResponseReader(io.IOBase):
    """File-like reader over a streaming curl_cffi response.

    Buffers chunks from ``response.iter_content()`` and releases the
    underlying curl handle as soon as the stream is fully consumed.
    """

    def __init__(self, response: curl_cffi.requests.Response):
        self._response = response
        self._chunk_iter = response.iter_content()
        self._pending = b''
        # Total number of raw bytes pulled from the response so far
        self.bytes_read = 0

    def readable(self):
        return True

    def read(self, size=None):
        try:
            # Accumulate chunks until we can satisfy the request (or EOF)
            while self._chunk_iter is not None and (size is None or len(self._pending) < size):
                piece = next(self._chunk_iter, None)
                if piece is None:
                    self._chunk_iter = None
                    break
                self._pending += piece
                self.bytes_read += len(piece)

            if size is None:
                size = len(self._pending)
            data, self._pending = self._pending[:size], self._pending[size:]

            # "free" the curl instance once the response is fully read.
            # curl_cffi doesn't do this automatically and only allows one
            # open response per thread.
            if self._chunk_iter is None and not self._pending:
                self.close()
            return data
        except BaseException:
            # Any failure mid-read leaves the curl handle unusable; release it
            self.close()
            raise

    def close(self):
        if not self.closed:
            self._response.close()
            self._pending = b''
        super().close()
class CurlCFFIResponseAdapter(Response):
    """Adapt a curl_cffi response to the yt-dlp Response interface."""
    fp: CurlCFFIResponseReader

    def __init__(self, response: curl_cffi.requests.Response):
        super().__init__(
            fp=CurlCFFIResponseReader(response),
            headers=response.headers,
            url=response.url,
            status=response.status_code)

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except curl_cffi.requests.errors.RequestsError as e:
            if e.code != CurlECode.PARTIAL_FILE:
                raise TransportError(cause=e) from e
            # Truncated body: report how much arrived and how much is missing
            total = e.response and int_or_none(e.response.headers.get('Content-Length'))
            missing = total - self.fp.bytes_read if total is not None else None
            raise IncompleteRead(
                partial=self.fp.bytes_read,
                expected=missing,
                cause=e) from e
# See: https://github.com/lexiforest/curl_cffi?tab=readme-ov-file#supported-impersonate-browsers
# https://github.com/lexiforest/curl-impersonate?tab=readme-ov-file#supported-browsers
# Impersonation targets keyed by the minimum curl_cffi (major, minor) version
# that provides them; a target set is usable when the installed version is
# >= its key (see CurlCFFIRH._SUPPORTED_IMPERSONATE_TARGET_MAP).
BROWSER_TARGETS: dict[tuple[int, ...], dict[str, ImpersonateTarget]] = {
    (0, 5): {
        'chrome99': ImpersonateTarget('chrome', '99', 'windows', '10'),
        'chrome99_android': ImpersonateTarget('chrome', '99', 'android', '12'),
        'chrome100': ImpersonateTarget('chrome', '100', 'windows', '10'),
        'chrome101': ImpersonateTarget('chrome', '101', 'windows', '10'),
        'chrome104': ImpersonateTarget('chrome', '104', 'windows', '10'),
        'chrome107': ImpersonateTarget('chrome', '107', 'windows', '10'),
        'chrome110': ImpersonateTarget('chrome', '110', 'windows', '10'),
        'edge99': ImpersonateTarget('edge', '99', 'windows', '10'),
        'edge101': ImpersonateTarget('edge', '101', 'windows', '10'),
        'safari153': ImpersonateTarget('safari', '15.3', 'macos', '11'),
        'safari155': ImpersonateTarget('safari', '15.5', 'macos', '12'),
    },
    (0, 7): {
        'chrome116': ImpersonateTarget('chrome', '116', 'windows', '10'),
        'chrome119': ImpersonateTarget('chrome', '119', 'macos', '14'),
        'chrome120': ImpersonateTarget('chrome', '120', 'macos', '14'),
        'chrome123': ImpersonateTarget('chrome', '123', 'macos', '14'),
        'chrome124': ImpersonateTarget('chrome', '124', 'macos', '14'),
        'safari170': ImpersonateTarget('safari', '17.0', 'macos', '14'),
        'safari172_ios': ImpersonateTarget('safari', '17.2', 'ios', '17.2'),
    },
    # NOTE: 0.9 redefines some earlier targets (e.g. safari153) with newer OS versions
    (0, 9): {
        'safari153': ImpersonateTarget('safari', '15.3', 'macos', '14'),
        'safari155': ImpersonateTarget('safari', '15.5', 'macos', '14'),
        'chrome119': ImpersonateTarget('chrome', '119', 'macos', '14'),
        'chrome120': ImpersonateTarget('chrome', '120', 'macos', '14'),
        'chrome123': ImpersonateTarget('chrome', '123', 'macos', '14'),
        'chrome124': ImpersonateTarget('chrome', '124', 'macos', '14'),
        'chrome131': ImpersonateTarget('chrome', '131', 'macos', '14'),
        'chrome131_android': ImpersonateTarget('chrome', '131', 'android', '14'),
        'chrome133a': ImpersonateTarget('chrome', '133', 'macos', '15'),
        'firefox133': ImpersonateTarget('firefox', '133', 'macos', '14'),
        'safari180': ImpersonateTarget('safari', '18.0', 'macos', '15'),
        'safari180_ios': ImpersonateTarget('safari', '18.0', 'ios', '18.0'),
    },
    (0, 10): {
        'firefox135': ImpersonateTarget('firefox', '135', 'macos', '14'),
    },
    (0, 11): {
        'tor145': ImpersonateTarget('tor', '14.5', 'macos', '14'),
        'safari184': ImpersonateTarget('safari', '18.4', 'macos', '15'),
        'safari184_ios': ImpersonateTarget('safari', '18.4', 'ios', '18.4'),
        'chrome136': ImpersonateTarget('chrome', '136', 'macos', '15'),
    },
    (0, 12): {
        'safari260': ImpersonateTarget('safari', '26.0', 'macos', '26'),
        'safari260_ios': ImpersonateTarget('safari', '26.0', 'ios', '26.0'),
    },
}
# Needed for curl_cffi < 0.11
# See: https://github.com/lexiforest/curl_cffi/commit/d2f15c7a31506a08d217fcc04ae7570c39f5f5bb
# Maps our compact target names to the underscore-separated identifiers that
# older curl_cffi releases expect; names not listed here are unchanged.
_TARGETS_COMPAT_LOOKUP = {
    'safari153': 'safari15_3',
    'safari155': 'safari15_5',
    'safari170': 'safari17_0',
    'safari172_ios': 'safari17_2_ios',
    'safari180': 'safari18_0',
    'safari180_ios': 'safari18_0_ios',
}
@register_rh
class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
    """Request handler backed by curl_cffi, providing browser impersonation."""
    RH_NAME = 'curl_cffi'
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    # Map each ImpersonateTarget to the identifier the installed curl_cffi
    # expects: the plain name (>= 0.11), a compat underscore name (>= 0.9),
    # or a BrowserType enum member (older). Built once at class-creation time,
    # keeping only target sets the installed version provides; the sort below
    # decides which target wins when the same ImpersonateTarget appears twice.
    _SUPPORTED_IMPERSONATE_TARGET_MAP = {
        target: (
            name if curl_cffi_version >= (0, 11)
            else _TARGETS_COMPAT_LOOKUP.get(name, name) if curl_cffi_version >= (0, 9)
            else curl_cffi.requests.BrowserType[_TARGETS_COMPAT_LOOKUP.get(name, name)]
        ) for name, target in dict(sorted(itertools.chain.from_iterable(
            targets.items()
            for version, targets in BROWSER_TARGETS.items()
            if curl_cffi_version >= version
        ), key=lambda x: (
            # deprioritize mobile targets since they give very different behavior
            x[1].os not in ('ios', 'android'),
            # prioritize tor < edge < firefox < safari < chrome
            ('tor', 'edge', 'firefox', 'safari', 'chrome').index(x[1].client),
            # prioritize newest version
            float(x[1].version) if x[1].version else 0,
            # group by os name
            x[1].os,
        ), reverse=True)).items()
    }

    def _create_instance(self, cookiejar=None):
        return curl_cffi.requests.Session(cookies=cookiejar)

    def _check_extensions(self, extensions):
        # Pop every extension this handler understands so the base-class
        # validation does not reject them as unknown
        super()._check_extensions(extensions)
        extensions.pop('impersonate', None)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)
        # CurlCFFIRH ignores legacy ssl options currently.
        # Impersonation generally uses a looser SSL configuration than urllib/requests.
        extensions.pop('legacy_ssl', None)

    def send(self, request: Request) -> Response:
        """Send the request, attaching the resolved impersonate target to the
        response extensions — including responses carried by HTTPError."""
        target = self._get_request_target(request)
        try:
            response = super().send(request)
        except HTTPError as e:
            e.response.extensions['impersonate'] = target
            raise
        response.extensions['impersonate'] = target
        return response

    def _send(self, request: Request):
        max_redirects_exceeded = False
        # Sessions are cached per cookiejar; when the request brings its own
        # Cookie header, use a jar-less session so jars don't get polluted
        session: curl_cffi.requests.Session = self._get_instance(
            cookiejar=self._get_cookiejar(request) if 'cookie' not in request.headers else None)
        if self.verbose:
            session.curl.setopt(CurlOpt.VERBOSE, 1)
        proxies = self._get_proxies(request)
        if 'no' in proxies:
            session.curl.setopt(CurlOpt.NOPROXY, proxies['no'])
            proxies.pop('no', None)
        # curl doesn't support per protocol proxies, so we select the one that matches the request protocol
        proxy = select_proxy(request.url, proxies=proxies)
        if proxy:
            session.curl.setopt(CurlOpt.PROXY, proxy)
            scheme = urllib.parse.urlparse(request.url).scheme.lower()
            if scheme != 'http':
                # Enable HTTP CONNECT for HTTPS urls.
                # Don't use CONNECT for http for compatibility with urllib behaviour.
                # See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html
                session.curl.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)
            # curl_cffi does not currently set these for proxies
            session.curl.setopt(CurlOpt.PROXY_CAINFO, certifi.where())
            if not self.verify:
                session.curl.setopt(CurlOpt.PROXY_SSL_VERIFYPEER, 0)
                session.curl.setopt(CurlOpt.PROXY_SSL_VERIFYHOST, 0)
        headers = self._get_impersonate_headers(request)
        if self._client_cert:
            session.curl.setopt(CurlOpt.SSLCERT, self._client_cert['client_certificate'])
            client_certificate_key = self._client_cert.get('client_certificate_key')
            client_certificate_password = self._client_cert.get('client_certificate_password')
            if client_certificate_key:
                session.curl.setopt(CurlOpt.SSLKEY, client_certificate_key)
            if client_certificate_password:
                session.curl.setopt(CurlOpt.KEYPASSWD, client_certificate_password)
        timeout = self._calculate_timeout(request)
        # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1]
        # This is required only for 0.5.10 [2]
        # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. [3]
        # [1] https://unix.stackexchange.com/a/305311
        # [2] https://github.com/yifeikong/curl_cffi/issues/156
        # [3] https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html
        session.curl.setopt(CurlOpt.LOW_SPEED_LIMIT, 1)  # 1 byte per second
        session.curl.setopt(CurlOpt.LOW_SPEED_TIME, math.ceil(timeout))
        try:
            curl_response = session.request(
                method=request.method,
                url=request.url,
                headers=headers,
                data=request.data,
                verify=self.verify,
                max_redirects=5,
                timeout=(timeout, timeout),
                impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get(
                    self._get_request_target(request)),
                interface=self.source_address,
                stream=True,
            )
        except curl_cffi.requests.errors.RequestsError as e:
            if e.code == CurlECode.PEER_FAILED_VERIFICATION:
                raise CertificateVerifyError(cause=e) from e
            elif e.code == CurlECode.SSL_CONNECT_ERROR:
                raise SSLError(cause=e) from e
            elif e.code == CurlECode.TOO_MANY_REDIRECTS:
                # Keep the last response; the HTTPError below carries redirect_loop
                max_redirects_exceeded = True
                curl_response = e.response
            elif (
                e.code == CurlECode.PROXY
                or (e.code == CurlECode.RECV_ERROR and 'CONNECT' in str(e))
            ):
                raise ProxyError(cause=e) from e
            else:
                raise TransportError(cause=e) from e
        response = CurlCFFIResponseAdapter(curl_response)
        if not 200 <= response.status < 300:
            raise HTTPError(response, redirect_loop=max_redirects_exceeded)
        return response
@register_preference(CurlCFFIRH)
def curl_cffi_preference(rh, request):
    """Rank curl_cffi below higher-preference handlers (cf. requests at +100)."""
    return -100

View File

@@ -0,0 +1,273 @@
from __future__ import annotations
import contextlib
import functools
import os
import socket
import ssl
import sys
import typing
import urllib.parse
import urllib.request
from .exceptions import RequestError
from ..dependencies import certifi
from ..socks import ProxyType, sockssocket
if typing.TYPE_CHECKING:
from collections.abc import Iterable
from ..utils.networking import HTTPHeaderDict
def ssl_load_certs(context: ssl.SSLContext, use_certifi=True):
    """Load trusted CA certificates into *context*.

    Prefers certifi's bundle when available and allowed; otherwise falls
    back to the system's default certificate stores.
    """
    if not (certifi and use_certifi):
        try:
            context.load_default_certs()
        except ssl.SSLError:
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    else:
        context.load_verify_locations(cafile=certifi.where())
def ssl_load_windows_store_certs(ssl_context, storename):
    """Load server-auth x509 certificates from a Windows cert store into *ssl_context*."""
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        enumerated = ssl.enum_certificates(storename)
    except PermissionError:
        return
    for cert, encoding, trust in enumerated:
        if encoding != 'x509_asn':
            continue
        # Accept certs trusted for everything, or explicitly for server auth
        if trust is not True and ssl.Purpose.SERVER_AUTH.oid not in trust:
            continue
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
def make_socks_proxy_opts(socks_proxy):
    """Parse a SOCKS proxy URL into the keyword options sockssocket.setproxy() takes."""
    parsed = urllib.parse.urlparse(socks_proxy)
    scheme = parsed.scheme.lower()
    # rdns: resolve hostnames on the proxy side ('h'/'a' variants) rather than locally
    if scheme == 'socks5':
        socks_type, rdns = ProxyType.SOCKS5, False
    elif scheme == 'socks5h':
        socks_type, rdns = ProxyType.SOCKS5, True
    elif scheme == 'socks4':
        socks_type, rdns = ProxyType.SOCKS4, False
    elif scheme == 'socks4a':
        socks_type, rdns = ProxyType.SOCKS4A, True
    else:
        raise ValueError(f'Unknown SOCKS proxy version: {scheme}')

    def _unquote(value):
        # Credentials in the URL are percent-encoded; None/'' pass through unchanged
        return urllib.parse.unquote_plus(value) if value else value

    return {
        'proxytype': socks_type,
        'addr': parsed.hostname,
        'port': parsed.port or 1080,
        'rdns': rdns,
        'username': _unquote(parsed.username),
        'password': _unquote(parsed.password),
    }
def get_redirect_method(method, status):
    """Unified redirect method handling"""
    # A 303 must either use GET or HEAD for the subsequent request
    # (https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4), and
    # browsers commonly turn a POST into a GET on 301/302, so we do the same
    # (https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2,
    #  https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3)
    demote_to_get = (
        (status == 303 and method != 'HEAD')
        or (status in (301, 302) and method == 'POST')
    )
    return 'GET' if demote_to_get else method
def make_ssl_context(
    verify=True,
    client_certificate=None,
    client_certificate_key=None,
    client_certificate_password=None,
    legacy_support=False,
    use_certifi=True,
):
    """Build a client-side ssl.SSLContext.

    @param verify                       verify server certificates and hostnames
    @param client_certificate           path to a client certificate file
    @param client_certificate_key       path to the certificate's private key
    @param client_certificate_password  password for the private key, if any
    @param legacy_support               allow legacy (unsafe) server renegotiation
                                        and fall back to default ciphers
    @param use_certifi                  prefer certifi's CA bundle when available
    @raises RequestError                if the client certificate cannot be loaded
    """
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = verify
    context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE
    # OpenSSL 1.1.1+ Python 3.8+ keylog file
    if hasattr(context, 'keylog_filename'):
        context.keylog_filename = os.environ.get('SSLKEYLOGFILE') or None
    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])
    if verify:
        ssl_load_certs(context, use_certifi)
    if legacy_support:
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        context.set_ciphers('DEFAULT')  # compat
    elif ssl.OPENSSL_VERSION_INFO >= (1, 1, 1) and not ssl.OPENSSL_VERSION.startswith('LibreSSL'):
        # Use the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions and libraries, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers(
            '@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2
    if client_certificate:
        try:
            context.load_cert_chain(
                client_certificate, keyfile=client_certificate_key,
                password=client_certificate_password)
        except ssl.SSLError:
            raise RequestError('Unable to load client certificate')
        if getattr(context, 'post_handshake_auth', None) is not None:
            context.post_handshake_auth = True
    return context
class InstanceStoreMixin:
    """Mixin that caches created instances, keyed by the kwargs used to build them."""

    def __init__(self, **kwargs):
        # List of (kwargs, instance) pairs; a dict can't be used since the
        # kwargs may contain unhashable values (e.g. cookiejars)
        self.__instances = []
        super().__init__(**kwargs)  # So that both MRO works

    @staticmethod
    def _create_instance(**kwargs):
        raise NotImplementedError

    def _get_instance(self, **kwargs):
        # Linear scan of the cache for an entry built with identical kwargs
        for cached_kwargs, cached in self.__instances:
            if cached_kwargs == kwargs:
                return cached
        created = self._create_instance(**kwargs)
        self.__instances.append((kwargs, created))
        return created

    def _close_instance(self, instance):
        close = getattr(instance, 'close', None)
        if callable(close):
            close()

    def _clear_instances(self):
        for _, cached in self.__instances:
            self._close_instance(cached)
        self.__instances.clear()
def add_accept_encoding_header(headers: HTTPHeaderDict, supported_encodings: Iterable[str]):
    """Set a default Accept-Encoding header from *supported_encodings* ('identity' if empty)."""
    if 'Accept-Encoding' in headers:
        return
    headers['Accept-Encoding'] = ', '.join(supported_encodings) or 'identity'
def wrap_request_errors(func):
    """Decorator: stamp the raising handler onto RequestErrors that lack one."""
    @functools.wraps(func)
    def bound_wrapper(self, *args, **kwargs):
        try:
            return func(self, *args, **kwargs)
        except RequestError as err:
            # Record which handler the error originated from, if not already set
            if err.handler is None:
                err.handler = self
            raise
    return bound_wrapper
def _socket_connect(ip_addr, timeout, source_address):
    """Open a TCP connection to one getaddrinfo() result tuple, closing the socket on failure."""
    family, kind, proto, _canonname, sockaddr = ip_addr
    sock = socket.socket(family, kind, proto)
    try:
        if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
            sock.settimeout(timeout)
        if source_address:
            sock.bind(source_address)
        sock.connect(sockaddr)
    except OSError:
        sock.close()
        raise
    return sock
def create_socks_proxy_socket(dest_addr, proxy_args, proxy_ip_addr, timeout, source_address):
    """Connect to *dest_addr* through the SOCKS proxy at *proxy_ip_addr*.

    *proxy_args* holds the setproxy() options (see make_socks_proxy_opts);
    its addr/port are overridden with the resolved proxy address.
    """
    family, kind, proto, _canonname, proxy_sockaddr = proxy_ip_addr
    sock = sockssocket(family, kind, proto)
    try:
        sock.setproxy(**{**proxy_args, 'addr': proxy_sockaddr[0], 'port': proxy_sockaddr[1]})
        if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
            sock.settimeout(timeout)
        if source_address:
            sock.bind(source_address)
        sock.connect(dest_addr)
    except OSError:
        sock.close()
        raise
    return sock
def create_connection(
    address,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    source_address=None,
    *,
    _create_socket_func=_socket_connect,
):
    """Connect to (host, port), trying each resolved address in turn.

    Works around socket.create_connection() trying every getaddrinfo()
    address, including IPv6 ones incompatible with the given source_address;
    candidates are filtered to the source address's family first.
    Based on: https://github.com/python/cpython/blob/main/Lib/socket.py#L810
    """
    host, port = address
    ip_addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
    if not ip_addrs:
        raise OSError('getaddrinfo returns an empty list')
    if source_address is not None:
        # A ':' in the source IP implies IPv6; keep only matching candidates
        wanted_af = socket.AF_INET6 if ':' in source_address[0] else socket.AF_INET
        ip_addrs = [ia for ia in ip_addrs if ia[0] == wanted_af]
        if not ip_addrs:
            raise OSError(
                f'No remote IPv{4 if wanted_af == socket.AF_INET else 6} addresses available for connect. '
                f'Can\'t use "{source_address[0]}" as source address')

    err = None
    for candidate in ip_addrs:
        try:
            sock = _create_socket_func(candidate, timeout, source_address)
            # Explicitly break __traceback__ reference cycle
            # https://bugs.python.org/issue36820
            err = None
            return sock
        except OSError as e:
            err = e

    try:
        raise err
    finally:
        # Explicitly break __traceback__ reference cycle
        # https://bugs.python.org/issue36820
        err = None

View File

@@ -0,0 +1,413 @@
from __future__ import annotations
import functools
import http.client
import logging
import re
import warnings
from ..dependencies import brotli, requests, urllib3
from ..utils import bug_reports_message, int_or_none, variadic
from ..utils.networking import normalize_url, select_proxy
# Both requests and urllib3 are hard requirements of this module; the
# ImportErrors below are caught by the optional-import guard in __init__.
if requests is None:
    raise ImportError('requests module is not installed')
if urllib3 is None:
    raise ImportError('urllib3 module is not installed')
# '2.0.2' -> (2, 0, 2); components that fail to parse default to 0
urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))
if urllib3_version < (2, 0, 2):
    # NOTE(review): _yt_dlp__version is presumably read for diagnostics — confirm
    urllib3._yt_dlp__version = f'{urllib3.__version__} (unsupported)'
    raise ImportError('Only urllib3 >= 2.0.2 is supported')
# requests.__build__ encodes the version in hex; 0x023202 corresponds to 2.32.2
if requests.__build__ < 0x023202:
    requests._yt_dlp__version = f'{requests.__version__} (unsupported)'
    raise ImportError('Only requests >= 2.32.2 is supported')
import requests.adapters
import requests.utils
import urllib3.connection
import urllib3.exceptions
import urllib3.util
from ._helper import (
InstanceStoreMixin,
add_accept_encoding_header,
create_connection,
create_socks_proxy_socket,
get_redirect_method,
make_socks_proxy_opts,
)
from .common import (
Features,
RequestHandler,
Response,
register_preference,
register_rh,
)
from .exceptions import (
CertificateVerifyError,
HTTPError,
IncompleteRead,
ProxyError,
RequestError,
SSLError,
TransportError,
)
from ..socks import ProxyError as SocksProxyError
# Content-Encodings we can decode transparently; brotli ('br') only when the
# optional brotli dependency is importable
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate',
]
if brotli is not None:
    SUPPORTED_ENCODINGS.append('br')
'''
Override urllib3's behavior to not convert lower-case percent-encoded characters
to upper-case during the url normalization process.
RFC 3986 defines that lower- and upper-case percent-encoded hexadecimal characters are equivalent
and that normalizers should convert them to uppercase for consistency [1].
However, some sites may have an incorrect implementation where they provide
a percent-encoded url that is then compared case-sensitively. [2]
While this is a very rare case, since urllib does not do this normalization step, it
is best to avoid it in requests too for compatibility reasons.
1: https://tools.ietf.org/html/rfc3986#section-2.1
2: https://github.com/streamlink/streamlink/pull/4003
'''
class Urllib3PercentREOverride:
    """Wrap a compiled regex so that subn() counts matches without substituting.

    Used to monkey-patch urllib3's percent-encoding normalization regex: the
    match count keeps urllib3's logic working, while the input string is
    returned unchanged so lower-case percent-escapes are preserved.
    """

    def __init__(self, r: re.Pattern):
        self.re = r

    # pass through all other attribute calls to the original re
    def __getattr__(self, item):
        # getattr() is the idiomatic delegation form (vs. __getattribute__)
        return getattr(self.re, item)

    def subn(self, repl, string, *args, **kwargs):
        # Report the real substitution count but leave the string untouched
        return string, self.re.subn(repl, string, *args, **kwargs)[1]
# urllib3 >= 1.25.8 uses subn:
# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
import urllib3.util.url

# Swap urllib3's percent-encoding regex for our no-op-substitution wrapper
if hasattr(urllib3.util.url, '_PERCENT_RE'):  # was 'PERCENT_RE' in urllib3 < 2.0.0
    urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
else:
    warnings.warn('Failed to patch _PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())

# Requests will not automatically handle no_proxy by default
# due to buggy no_proxy handling with proxy dict [1].
# 1. https://github.com/psf/requests/issues/5000
requests.adapters.select_proxy = select_proxy
class RequestsResponseAdapter(Response):
    """Adapt a requests.Response to the yt-dlp Response interface.

    Errors raised while reading from the underlying urllib3 file object are
    translated into the networking exception hierarchy (SSLError,
    IncompleteRead, TransportError).
    """

    def __init__(self, res: requests.models.Response):
        super().__init__(
            fp=res.raw, headers=res.headers, url=res.url,
            status=res.status_code, reason=res.reason)
        # Keep the originating requests response alive alongside its raw fp
        self._requests_response = res

    def read(self, amt: int | None = None):
        try:
            # Work around issue with `.read(amt)` then `.read()`
            # See: https://github.com/urllib3/urllib3/issues/3636
            if amt is None:
                # compat: py3.9: Python 3.9 preallocates the whole read buffer, read in chunks
                read_chunk = functools.partial(self.fp.read, 1 << 20, decode_content=True)
                return b''.join(iter(read_chunk, b''))
            # Interact with urllib3 response directly.
            return self.fp.read(amt, decode_content=True)
        # See urllib3.response.HTTPResponse.read() for exceptions raised on read
        except urllib3.exceptions.SSLError as e:
            raise SSLError(cause=e) from e
        except urllib3.exceptions.ProtocolError as e:
            # IncompleteRead is always contained within ProtocolError
            # See urllib3.response.HTTPResponse._error_catcher()
            ir_err = next(
                (err for err in (e.__context__, e.__cause__, *variadic(e.args))
                 if isinstance(err, http.client.IncompleteRead)), None)
            if ir_err is not None:
                # `urllib3.exceptions.IncompleteRead` is subclass of `http.client.IncompleteRead`
                # but uses an `int` for its `partial` property.
                partial = ir_err.partial if isinstance(ir_err.partial, int) else len(ir_err.partial)
                raise IncompleteRead(partial=partial, expected=ir_err.expected) from e
            raise TransportError(cause=e) from e
        except urllib3.exceptions.HTTPError as e:
            # catch-all for any other urllib3 response exceptions
            raise TransportError(cause=e) from e
class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
    """HTTPAdapter that injects our SSLContext and source address into the
    urllib3 pool managers it creates."""

    def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
        # Extra kwargs forwarded to every PoolManager/ProxyManager we create
        self._pm_args = {}
        if ssl_context:
            self._pm_args['ssl_context'] = ssl_context
        if source_address:
            self._pm_args['source_address'] = (source_address, 0)  # port 0: OS picks
        # TLS context used when talking *to* an https proxy
        self._proxy_ssl_context = proxy_ssl_context or ssl_context
        super().__init__(**kwargs)

    def init_poolmanager(self, *args, **kwargs):
        return super().init_poolmanager(*args, **kwargs, **self._pm_args)

    def proxy_manager_for(self, proxy, **proxy_kwargs):
        extra_kwargs = {}
        # proxy_ssl_context does not apply to SOCKS proxies
        if not proxy.lower().startswith('socks') and self._proxy_ssl_context:
            extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context
        return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)

    # Skip `requests` internal verification; we use our own SSLContext
    def cert_verify(*args, **kwargs):
        pass

    # requests 2.32.2+: Reimplementation without `_urllib3_request_context`
    def get_connection_with_tls_context(self, request, verify, proxies=None, cert=None):
        url = urllib3.util.parse_url(request.url).url
        manager = self.poolmanager
        if proxy := select_proxy(url, proxies):
            manager = self.proxy_manager_for(proxy)
        return manager.connection_from_url(url)
class RequestsSession(requests.sessions.Session):
    """
    Ensure unified redirect method handling with our urllib redirect handler.
    """

    def rebuild_method(self, prepared_request, response):
        new_method = get_redirect_method(prepared_request.method, response.status_code)
        # HACK: requests removes headers/body on redirect unless code was a 307/308.
        # When the method is kept, pretend the status was 308 so the body and
        # headers survive; rebuild_auth() below restores the real status code.
        if new_method == prepared_request.method:
            response._real_status_code = response.status_code
            response.status_code = 308
        prepared_request.method = new_method
        # Requests fails to resolve dot segments on absolute redirect locations
        # See: https://github.com/yt-dlp/yt-dlp/issues/9020
        prepared_request.url = normalize_url(prepared_request.url)

    def rebuild_auth(self, prepared_request, response):
        # HACK: undo status code change from rebuild_method, if applicable.
        # rebuild_auth runs after requests would remove headers/body based on status code
        if hasattr(response, '_real_status_code'):
            response.status_code = response._real_status_code
            del response._real_status_code
        return super().rebuild_auth(prepared_request, response)
class Urllib3LoggingFilter(logging.Filter):
    """Drop urllib3's per-request log line; HTTPConnection already prints those."""

    # The exact (unformatted) message template urllib3 logs HTTP requests with
    _REQUEST_MSG = '%s://%s:%s "%s %s %s" %s %s'

    def filter(self, record):
        return record.msg != self._REQUEST_MSG
class Urllib3LoggingHandler(logging.Handler):
    """Redirect urllib3 logs to our logger"""

    def __init__(self, logger, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._logger = logger

    def emit(self, record):
        try:
            message = self.format(record)
            # ERROR and above go to the error channel, everything else to stdout
            sink = self._logger.error if record.levelno >= logging.ERROR else self._logger.stdout
            sink(message)
        except Exception:
            self.handleError(record)
@register_rh
class RequestsRH(RequestHandler, InstanceStoreMixin):
    """Requests RequestHandler
    https://github.com/psf/requests
    """
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'requests'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Forward urllib3 debug messages to our logger
        logger = logging.getLogger('urllib3')
        self.__logging_handler = Urllib3LoggingHandler(logger=self._logger)
        self.__logging_handler.setFormatter(logging.Formatter('requests: %(message)s'))
        self.__logging_handler.addFilter(Urllib3LoggingFilter())
        logger.addHandler(self.__logging_handler)
        # TODO: Use a logger filter to suppress pool reuse warning instead
        logger.setLevel(logging.ERROR)

        if self.verbose:
            # Setting this globally is not ideal, but is easier than hacking with urllib3.
            # It could technically be problematic for scripts embedding yt-dlp.
            # However, it is unlikely debug traffic is used in that context in a way this will cause problems.
            urllib3.connection.HTTPConnection.debuglevel = 1
            logger.setLevel(logging.DEBUG)
        # this is expected if we are using --no-check-certificate
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def close(self):
        """Release cached sessions and detach our urllib3 logging handler."""
        self._clear_instances()
        # Remove the logging handler that contains a reference to our logger
        # See: https://github.com/yt-dlp/yt-dlp/issues/8922
        logging.getLogger('urllib3').removeHandler(self.__logging_handler)

    def _check_extensions(self, extensions):
        # Pop every extension this handler understands so base-class
        # validation does not reject them as unknown
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)
        extensions.pop('legacy_ssl', None)
        extensions.pop('keep_header_casing', None)

    def _create_instance(self, cookiejar, legacy_ssl_support=None):
        """Build a fresh RequestsSession mounted with our SSL-aware adapter."""
        session = RequestsSession()
        http_adapter = RequestsHTTPAdapter(
            ssl_context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support),
            source_address=self.source_address,
            max_retries=urllib3.util.retry.Retry(False),
        )
        session.adapters.clear()
        session.headers = requests.models.CaseInsensitiveDict()
        session.mount('https://', http_adapter)
        session.mount('http://', http_adapter)
        session.cookies = cookiejar
        session.trust_env = False  # no need, we already load proxies from env
        return session

    def _prepare_headers(self, _, headers):
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        headers.setdefault('Connection', 'keep-alive')

    def _send(self, request):
        """Perform the request, mapping requests/urllib3 errors onto our exception types."""
        headers = self._get_headers(request)
        max_redirects_exceeded = False

        session = self._get_instance(
            cookiejar=self._get_cookiejar(request),
            legacy_ssl_support=request.extensions.get('legacy_ssl'),
        )

        try:
            requests_res = session.request(
                method=request.method,
                url=request.url,
                data=request.data,
                headers=headers,
                timeout=self._calculate_timeout(request),
                proxies=self._get_proxies(request),
                allow_redirects=True,
                stream=True,
            )
        except requests.exceptions.TooManyRedirects as e:
            # Keep the last response; the HTTPError below carries redirect_loop
            max_redirects_exceeded = True
            requests_res = e.response
        except requests.exceptions.SSLError as e:
            if 'CERTIFICATE_VERIFY_FAILED' in str(e):
                raise CertificateVerifyError(cause=e) from e
            raise SSLError(cause=e) from e
        except requests.exceptions.ProxyError as e:
            raise ProxyError(cause=e) from e
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            raise TransportError(cause=e) from e
        except urllib3.exceptions.HTTPError as e:
            # Catch any urllib3 exceptions that may leak through
            raise TransportError(cause=e) from e
        except requests.exceptions.RequestException as e:
            # Miscellaneous Requests exceptions. May not necessarily be network related e.g. InvalidURL
            raise RequestError(cause=e) from e

        res = RequestsResponseAdapter(requests_res)

        if not 200 <= res.status < 300:
            raise HTTPError(res, redirect_loop=max_redirects_exceeded)

        return res
@register_preference(RequestsRH)
def requests_preference(rh, request):
    """Rank the requests handler above lower-preference handlers (cf. curl_cffi at -100)."""
    return 100
# Use our socks proxy implementation with requests to avoid an extra dependency.
class SocksHTTPConnection(urllib3.connection.HTTPConnection):
    """HTTPConnection whose raw socket is established through a SOCKS proxy
    via yt-dlp's sockssocket implementation."""

    def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks
        self._proxy_args = _socks_options
        super().__init__(*args, **kwargs)

    def _new_conn(self):
        # Translate our socket-level failures into the urllib3 exception types
        # that urllib3's retry/error machinery expects
        try:
            return create_connection(
                address=(self._proxy_args['addr'], self._proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), self._proxy_args))
        except TimeoutError as e:
            raise urllib3.exceptions.ConnectTimeoutError(
                self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e
        except SocksProxyError as e:
            raise urllib3.exceptions.ProxyError(str(e), e) from e
        except OSError as e:
            raise urllib3.exceptions.NewConnectionError(
                self, f'Failed to establish a new connection: {e}') from e
class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection):
    # HTTPS variant: SOCKS tunnelling from SocksHTTPConnection, TLS handling
    # inherited from urllib3's HTTPSConnection.
    pass
class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool):
    # Pool that hands out SOCKS-tunnelled plain-HTTP connections.
    ConnectionCls = SocksHTTPConnection
class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool):
    # Pool that hands out SOCKS-tunnelled HTTPS connections.
    ConnectionCls = SocksHTTPSConnection
class SocksProxyManager(urllib3.PoolManager):
    def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
        # Parse the SOCKS proxy url once; passing the options as a connection
        # kwarg makes them part of urllib3's pool key, so distinct proxies get
        # distinct pools.
        connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy)
        super().__init__(num_pools, headers, **connection_pool_kw)
        self.pool_classes_by_scheme = {
            'http': SocksHTTPConnectionPool,
            'https': SocksHTTPSConnectionPool,
        }
requests.adapters.SOCKSProxyManager = SocksProxyManager

View File

@@ -0,0 +1,425 @@
from __future__ import annotations
import functools
import http.client
import io
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
DataHandler,
FileHandler,
FTPHandler,
HTTPCookieProcessor,
HTTPDefaultErrorHandler,
HTTPErrorProcessor,
UnknownHandler,
)
from ._helper import (
InstanceStoreMixin,
add_accept_encoding_header,
create_connection,
create_socks_proxy_socket,
get_redirect_method,
make_socks_proxy_opts,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
CertificateVerifyError,
HTTPError,
IncompleteRead,
ProxyError,
RequestError,
SSLError,
TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..utils import update_url_query
from ..utils.networking import normalize_url, select_proxy
# Content encodings this handler can decode; brotli support is optional and
# only advertised when the dependency is importable.
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decompressing a response body.
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
def _create_http_connection(http_class, source_address, *args, **kwargs):
hc = http_class(*args, **kwargs)
if hasattr(hc, '_create_connection'):
hc._create_connection = create_connection
if source_address is not None:
hc.source_address = (source_address, 0)
return hc
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:
    http://techknack.net/python-urllib2-handlers/
    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Local IP address to bind outgoing sockets to, if any.
        self._source_address = source_address
        # ssl.SSLContext used for HTTPS connections.
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        # Wrap the connection class with SOCKS tunnelling when the request
        # carries our internal Ytdl-socks-proxy marker header (set by
        # ProxyHandler); the marker is consumed here and never sent.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        # Tolerate both raw deflate streams and zlib-wrapped ones;
        # servers disagree on the framing.
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Re-wrap the decoded body in a response object that mirrors the
            # original's metadata.
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by Python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from *base_class* that tunnels through the
    SOCKS proxy described by the *socks_proxy* url."""
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            # Open the socket to the proxy, then negotiate SOCKS to reach the
            # actual (host, port) of the request.
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), proxy_args))
            # For HTTPS, wrap the tunnelled socket in TLS against the real host.
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].
    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # Route every supported redirect status through the common 302 handler.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
class ProxyHandler(urllib.request.BaseHandler):
    # Run early so the proxy decision is made before the protocol handlers.
    handler_order = 100

    def __init__(self, proxies=None):
        # Mapping of url scheme -> proxy url (plus the 'all'/'no' keys).
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            # Mark the request for SOCKS handling; HTTPHandler pops this
            # header and wraps the connection class accordingly.
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
class PUTRequest(urllib.request.Request):
    """A urllib Request whose HTTP verb is always PUT."""

    def get_method(self):
        # Unconditional, unlike the base class which infers GET/POST from data.
        return 'PUT'
class HEADRequest(urllib.request.Request):
    """A urllib Request whose HTTP verb is always HEAD."""

    def get_method(self):
        # Unconditional, unlike the base class which infers GET/POST from data.
        return 'HEAD'
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req*, optionally overriding its url, payload, headers or query.

    The HTTP method of the original request (including HEAD/PUT forcing)
    and its timeout attribute, when present, are preserved.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    new_url = update_url_query(url or req.get_full_url(), query)
    # Preserve forced HEAD/PUT semantics via the specialised Request classes.
    method = req.get_method()
    method_to_class = {'HEAD': HEADRequest, 'PUT': PUTRequest}
    request_class = method_to_class.get(method, urllib.request.Request)
    new_req = request_class(
        new_url,
        data=data if data is not None else req.data,
        headers=merged_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable,
    )
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        """Read up to *amt* bytes, translating low-level errors into
        yt-dlp network exceptions where possible."""
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            # Not a recognised network error: re-raise as-is. A bare `raise`
            # preserves the original traceback instead of appending this
            # frame (which `raise e` would do).
            raise
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as the matching yt-dlp SSL exception.

    Anything that is not an ssl.SSLError is ignored, so callers may pass
    arbitrary exceptions through.
    """
    if isinstance(e, ssl.SSLError):
        # Certificate validation failures get their own, more specific type.
        exc_type = CertificateVerifyError if isinstance(e, ssl.SSLCertVerificationError) else SSLError
        raise exc_type(cause=e) from e
def handle_response_read_exceptions(e):
    """Map an exception raised while reading a response body onto a yt-dlp
    network error; unrecognised exception types pass through silently."""
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    if isinstance(e, ssl.SSLError):
        handle_sslerror(e)
        return
    transport_error_types = (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)
    if isinstance(e, transport_error_types):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        # file:// support is opt-in since it allows reading local files.
        self.enable_file_urls = enable_file_urls

        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        # Pop the extensions this handler supports so the base validation
        # does not reject them as unknown.
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)
        extensions.pop('legacy_ssl', None)

    def _create_instance(self, proxies, cookiejar, legacy_ssl_support=None):
        # Build an OpenerDirector from scratch rather than via build_opener(),
        # so we control exactly which handlers are installed.
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _prepare_headers(self, _, headers):
        # Advertise the content encodings we are able to decode.
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)

    def _send(self, request):
        """Send *request* via a cached OpenerDirector, adapting errors and
        the response to yt-dlp's networking types."""
        headers = self._get_headers(request)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=headers,
            method=request.method,
        )

        opener = self._get_instance(
            proxies=self._get_proxies(request),
            cookiejar=self._get_cookiejar(request),
            legacy_ssl_support=request.extensions.get('legacy_ssl'),
        )
        try:
            res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)

View File

@@ -0,0 +1,189 @@
from __future__ import annotations
import contextlib
import functools
import io
import logging
import ssl
import sys
from ._helper import (
create_connection,
create_socks_proxy_socket,
make_socks_proxy_opts,
)
from ..utils.networking import select_proxy
from .common import Features, Response, register_rh
from .exceptions import (
CertificateVerifyError,
HTTPError,
ProxyError,
RequestError,
SSLError,
TransportError,
)
from .websocket import WebSocketRequestHandler, WebSocketResponse
from ..dependencies import websockets
from ..socks import ProxyError as SocksProxyError
from ..utils import int_or_none
if not websockets:
    raise ImportError('websockets is not installed')

import websockets.version

# Parse the version into a comparable tuple; non-numeric parts become None.
websockets_version = tuple(map(int_or_none, websockets.version.version.split('.')))
if websockets_version < (13, 0):
    # Mark the module so the version report shows it as unsupported.
    websockets._yt_dlp__version = f'{websockets.version.version} (unsupported)'
    raise ImportError('Only websockets>=13.0 is supported')

import websockets.sync.client
from websockets.uri import parse_uri

# In websockets Connection, recv_exc and recv_events_exc are defined
# after the recv events handler thread is started [1].
# On our CI using PyPy, in some cases a race condition may occur
# where the recv events handler thread tries to use these attributes before they are defined [2].
# 1: https://github.com/python-websockets/websockets/blame/de768cf65e7e2b1a3b67854fb9e08816a5ff7050/src/websockets/sync/connection.py#L93
# 2: "AttributeError: 'ClientConnection' object has no attribute 'recv_events_exc'. Did you mean: 'recv_events'?"
import websockets.sync.connection  # isort: split
with contextlib.suppress(Exception):
    # Pre-define the attribute as a class default to avoid the race above.
    websockets.sync.connection.Connection.recv_exc = None
class WebsocketsResponseAdapter(WebSocketResponse):
    """Adapts a websockets sync ClientConnection to yt-dlp's WebSocketResponse."""

    def __init__(self, ws: websockets.sync.client.ClientConnection, url):
        super().__init__(
            fp=io.BytesIO(ws.response.body or b''),
            url=url,
            headers=ws.response.headers,
            status=ws.response.status_code,
            reason=ws.response.reason_phrase,
        )
        self._ws = ws

    def close(self):
        self._ws.close()
        super().close()

    def send(self, message):
        # https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.send
        try:
            return self._ws.send(message)
        except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e:
            raise TransportError(cause=e) from e
        except SocksProxyError as e:
            raise ProxyError(cause=e) from e
        except TypeError as e:
            # e.g. a payload type that websockets cannot serialize
            raise RequestError(cause=e) from e

    def recv(self):
        # https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.recv
        try:
            return self._ws.recv()
        except SocksProxyError as e:
            raise ProxyError(cause=e) from e
        except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e:
            raise TransportError(cause=e) from e
@register_rh
class WebsocketsRH(WebSocketRequestHandler):
    """
    Websockets request handler
    https://websockets.readthedocs.io
    https://github.com/python-websockets/websockets
    """
    _SUPPORTED_URL_SCHEMES = ('wss', 'ws')
    _SUPPORTED_PROXY_SCHEMES = ('socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.ALL_PROXY, Features.NO_PROXY)
    RH_NAME = 'websockets'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Route the library's internal loggers through our own handlers so
        # their output is prefixed and enabled only in verbose mode.
        self.__logging_handlers = {}
        for name in ('websockets.client', 'websockets.server'):
            logger = logging.getLogger(name)
            handler = logging.StreamHandler(stream=sys.stdout)
            handler.setFormatter(logging.Formatter(f'{self.RH_NAME}: %(message)s'))
            self.__logging_handlers[name] = handler
            logger.addHandler(handler)
            if self.verbose:
                logger.setLevel(logging.DEBUG)

    def _check_extensions(self, extensions):
        # Pop the extensions this handler supports so the base validation
        # does not reject them as unknown.
        super()._check_extensions(extensions)
        extensions.pop('timeout', None)
        extensions.pop('cookiejar', None)
        extensions.pop('legacy_ssl', None)
        extensions.pop('keep_header_casing', None)

    def close(self):
        # Remove the logging handler that contains a reference to our logger
        # See: https://github.com/yt-dlp/yt-dlp/issues/8922
        for name, handler in self.__logging_handlers.items():
            logging.getLogger(name).removeHandler(handler)

    def _prepare_headers(self, request, headers):
        # Attach cookies from the jar only when the caller has not supplied
        # an explicit Cookie header.
        if 'cookie' not in headers:
            cookiejar = self._get_cookiejar(request)
            cookie_header = cookiejar.get_cookie_header(request.url)
            if cookie_header:
                headers['cookie'] = cookie_header

    def _send(self, request):
        """Open a websocket connection for *request*, optionally tunnelled
        through a SOCKS proxy, and adapt it/its errors to yt-dlp types."""
        timeout = self._calculate_timeout(request)
        headers = self._get_headers(request)
        wsuri = parse_uri(request.url)
        create_conn_kwargs = {
            'source_address': (self.source_address, 0) if self.source_address else None,
            'timeout': timeout,
        }
        proxy = select_proxy(request.url, self._get_proxies(request))
        try:
            if proxy:
                # Create the raw socket ourselves so the SOCKS handshake runs
                # before websockets takes over the connection.
                socks_proxy_options = make_socks_proxy_opts(proxy)
                sock = create_connection(
                    address=(socks_proxy_options['addr'], socks_proxy_options['port']),
                    _create_socket_func=functools.partial(
                        create_socks_proxy_socket, (wsuri.host, wsuri.port), socks_proxy_options),
                    **create_conn_kwargs,
                )
            else:
                sock = create_connection(
                    address=(wsuri.host, wsuri.port),
                    **create_conn_kwargs,
                )
            ssl_ctx = self._make_sslcontext(legacy_ssl_support=request.extensions.get('legacy_ssl'))
            conn = websockets.sync.client.connect(
                sock=sock,
                uri=request.url,
                additional_headers=headers,
                open_timeout=timeout,
                user_agent_header=None,
                ssl=ssl_ctx if wsuri.secure else None,
                close_timeout=0,  # not ideal, but prevents yt-dlp hanging
            )
            return WebsocketsResponseAdapter(conn, url=request.url)

        # Exceptions as per https://websockets.readthedocs.io/en/stable/reference/sync/client.html
        except SocksProxyError as e:
            raise ProxyError(cause=e) from e
        except websockets.exceptions.InvalidURI as e:
            raise RequestError(cause=e) from e
        except ssl.SSLCertVerificationError as e:
            raise CertificateVerifyError(cause=e) from e
        except ssl.SSLError as e:
            raise SSLError(cause=e) from e
        except websockets.exceptions.InvalidStatus as e:
            # Server rejected the handshake; surface it as an HTTP error with
            # the handshake response attached.
            raise HTTPError(
                Response(
                    fp=io.BytesIO(e.response.body),
                    url=request.url,
                    headers=e.response.headers,
                    status=e.response.status_code,
                    reason=e.response.reason_phrase),
            ) from e
        except (OSError, TimeoutError, websockets.exceptions.WebSocketException) as e:
            raise TransportError(cause=e) from e

View File

@@ -0,0 +1,604 @@
from __future__ import annotations
import abc
import copy
import enum
import functools
import io
import typing
import urllib.parse
import urllib.request
import urllib.response
from collections.abc import Iterable, Mapping
from email.message import Message
from http import HTTPStatus
from types import NoneType
from ._helper import make_ssl_context, wrap_request_errors
from .exceptions import (
NoSupportingHandlers,
RequestError,
TransportError,
UnsupportedRequest,
)
from ..cookies import YoutubeDLCookieJar
from ..utils import (
bug_reports_message,
classproperty,
deprecation_warning,
error_to_str,
update_url_query,
)
from ..utils.networking import HTTPHeaderDict, normalize_url
DEFAULT_TIMEOUT = 20
def register_preference(*handlers: type[RequestHandler]):
    """Decorator factory that registers a preference function.

    The decorated function is called as func(handler, request) -> int.
    It is only consulted for handlers that are instances of *handlers*
    (or for every handler when none are given); otherwise it contributes 0.
    """
    assert all(issubclass(handler, RequestHandler) for handler in handlers)

    def outer(preference: Preference):
        @functools.wraps(preference)
        def inner(handler, *args, **kwargs):
            if not handlers or isinstance(handler, handlers):
                return preference(handler, *args, **kwargs)
            return 0
        _RH_PREFERENCES.add(inner)
        return inner
    return outer
class RequestDirector:
    """RequestDirector class

    Helper class that, when given a request, forward it to a RequestHandler that supports it.

    Preference functions in the form of func(handler, request) -> int
    can be registered into the `preferences` set. These are used to sort handlers
    in order of preference.

    @param logger: Logger instance.
    @param verbose: Print debug request information to stdout.
    """

    def __init__(self, logger, verbose=False):
        # Registered handlers, keyed by their RH_KEY.
        self.handlers: dict[str, RequestHandler] = {}
        self.preferences: set[Preference] = set()
        self.logger = logger  # TODO(Grub4k): default logger
        self.verbose = verbose

    def close(self):
        for handler in self.handlers.values():
            handler.close()
        self.handlers.clear()

    def add_handler(self, handler: RequestHandler):
        """Add a handler. If a handler of the same RH_KEY exists, it will overwrite it"""
        assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler'
        self.handlers[handler.RH_KEY] = handler

    def _get_handlers(self, request: Request) -> list[RequestHandler]:
        """Sorts handlers by preference, given a request"""
        # Each handler's score is the sum of all registered preference
        # functions applied to it for this request.
        preferences = {
            rh: sum(pref(rh, request) for pref in self.preferences)
            for rh in self.handlers.values()
        }
        self._print_verbose('Handler preferences for this request: {}'.format(', '.join(
            f'{rh.RH_NAME}={pref}' for rh, pref in preferences.items())))
        return sorted(self.handlers.values(), key=preferences.get, reverse=True)

    def _print_verbose(self, msg):
        if self.verbose:
            self.logger.stdout(f'director: {msg}')

    def send(self, request: Request) -> Response:
        """
        Passes a request onto a suitable RequestHandler
        """
        if not self.handlers:
            raise RequestError('No request handlers configured')

        assert isinstance(request, Request)

        unexpected_errors = []
        unsupported_errors = []
        # Try handlers from most to least preferred; fall through to the next
        # one on validation failure or unexpected (non-RequestError) failure.
        for handler in self._get_handlers(request):
            self._print_verbose(f'Checking if "{handler.RH_NAME}" supports this request.')
            try:
                handler.validate(request)
            except UnsupportedRequest as e:
                self._print_verbose(
                    f'"{handler.RH_NAME}" cannot handle this request (reason: {error_to_str(e)})')
                unsupported_errors.append(e)
                continue

            self._print_verbose(f'Sending request via "{handler.RH_NAME}"')
            try:
                response = handler.send(request)
            except RequestError:
                raise
            except Exception as e:
                self.logger.error(
                    f'[{handler.RH_NAME}] Unexpected error: {error_to_str(e)}{bug_reports_message()}',
                    is_error=False)
                unexpected_errors.append(e)
                continue

            assert isinstance(response, Response)
            return response

        raise NoSupportingHandlers(unsupported_errors, unexpected_errors)
# Global registry of RequestHandler classes, keyed by RH_KEY.
_REQUEST_HANDLERS = {}


def register_rh(handler):
    """Register a RequestHandler class"""
    assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler'
    assert handler.RH_KEY not in _REQUEST_HANDLERS, f'RequestHandler {handler.RH_KEY} already registered'
    _REQUEST_HANDLERS[handler.RH_KEY] = handler
    return handler
class Features(enum.Enum):
    """Optional capabilities a RequestHandler may declare in _SUPPORTED_FEATURES."""
    ALL_PROXY = enum.auto()  # supports the 'all' proxy key
    NO_PROXY = enum.auto()   # supports the 'no' proxy bypass list
class RequestHandler(abc.ABC):
"""Request Handler class
Request handlers are class that, given a Request,
process the request from start to finish and return a Response.
Concrete subclasses need to redefine the _send(request) method,
which handles the underlying request logic and returns a Response.
RH_NAME class variable may contain a display name for the RequestHandler.
By default, this is generated from the class name.
The concrete request handler MUST have "RH" as the suffix in the class name.
All exceptions raised by a RequestHandler should be an instance of RequestError.
Any other exception raised will be treated as a handler issue.
If a Request is not supported by the handler, an UnsupportedRequest
should be raised with a reason.
By default, some checks are done on the request in _validate() based on the following class variables:
- `_SUPPORTED_URL_SCHEMES`: a tuple of supported url schemes.
Any Request with an url scheme not in this list will raise an UnsupportedRequest.
- `_SUPPORTED_PROXY_SCHEMES`: a tuple of support proxy url schemes. Any Request that contains
a proxy url with an url scheme not in this list will raise an UnsupportedRequest.
- `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in Features enum.
The above may be set to None to disable the checks.
Parameters:
@param logger: logger instance
@param headers: HTTP Headers to include when sending requests.
@param cookiejar: Cookiejar to use for requests.
@param timeout: Socket timeout to use when sending requests.
@param proxies: Proxies to use for sending requests.
@param source_address: Client-side IP address to bind to for requests.
@param verbose: Print debug request and traffic information to stdout.
@param prefer_system_certs: Whether to prefer system certificates over other means (e.g. certifi).
@param client_cert: SSL client certificate configuration.
dict with {client_certificate, client_certificate_key, client_certificate_password}
@param verify: Verify SSL certificates
@param legacy_ssl_support: Enable legacy SSL options such as legacy server connect and older cipher support.
Some configuration options may be available for individual Requests too. In this case,
either the Request configuration option takes precedence or they are merged.
Requests may have additional optional parameters defined as extensions.
RequestHandler subclasses may choose to support custom extensions.
If an extension is supported, subclasses should extend _check_extensions(extensions)
to pop and validate the extension.
- Extensions left in `extensions` are treated as unsupported and UnsupportedRequest will be raised.
The following extensions are defined for RequestHandler:
- `cookiejar`: Cookiejar to use for this request.
- `timeout`: socket timeout to use for this request.
- `legacy_ssl`: Enable legacy SSL options for this request. See legacy_ssl_support.
- `keep_header_casing`: Keep the casing of headers when sending the request.
To enable these, add extensions.pop('<extension>', None) to _check_extensions
Apart from the url protocol, proxies dict may contain the following keys:
- `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol.
- `no`: comma seperated list of hostnames (optionally with port) to not use a proxy for.
Note: a RequestHandler may not support these, as defined in `_SUPPORTED_FEATURES`.
"""
_SUPPORTED_URL_SCHEMES = ()
_SUPPORTED_PROXY_SCHEMES = ()
_SUPPORTED_FEATURES = ()
def __init__(
self, *,
logger, # TODO(Grub4k): default logger
headers: HTTPHeaderDict = None,
cookiejar: YoutubeDLCookieJar = None,
timeout: float | int | None = None,
proxies: dict | None = None,
source_address: str | None = None,
verbose: bool = False,
prefer_system_certs: bool = False,
client_cert: dict[str, str | None] | None = None,
verify: bool = True,
legacy_ssl_support: bool = False,
**_,
):
self._logger = logger
self.headers = headers or {}
self.cookiejar = cookiejar if cookiejar is not None else YoutubeDLCookieJar()
self.timeout = float(timeout or DEFAULT_TIMEOUT)
self.proxies = proxies or {}
self.source_address = source_address
self.verbose = verbose
self.prefer_system_certs = prefer_system_certs
self._client_cert = client_cert or {}
self.verify = verify
self.legacy_ssl_support = legacy_ssl_support
super().__init__()
def _make_sslcontext(self, legacy_ssl_support=None):
return make_ssl_context(
verify=self.verify,
legacy_support=legacy_ssl_support if legacy_ssl_support is not None else self.legacy_ssl_support,
use_certifi=not self.prefer_system_certs,
**self._client_cert,
)
def _merge_headers(self, request_headers):
return HTTPHeaderDict(self.headers, request_headers)
def _prepare_headers(self, request: Request, headers: HTTPHeaderDict) -> None: # noqa: B027
"""Additional operations to prepare headers before building. To be extended by subclasses.
@param request: Request object
@param headers: Merged headers to prepare
"""
def _get_headers(self, request: Request) -> dict[str, str]:
"""
Get headers for external use.
Subclasses may define a _prepare_headers method to modify headers after merge but before building.
"""
headers = self._merge_headers(request.headers)
self._prepare_headers(request, headers)
if request.extensions.get('keep_header_casing'):
return headers.sensitive()
return dict(headers)
def _calculate_timeout(self, request):
return float(request.extensions.get('timeout') or self.timeout)
def _get_cookiejar(self, request):
cookiejar = request.extensions.get('cookiejar')
return self.cookiejar if cookiejar is None else cookiejar
def _get_proxies(self, request):
return (request.proxies or self.proxies).copy()
def _check_url_scheme(self, request: Request):
scheme = urllib.parse.urlparse(request.url).scheme.lower()
if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES:
raise UnsupportedRequest(f'Unsupported url scheme: "{scheme}"')
return scheme # for further processing
def _check_proxies(self, proxies):
for proxy_key, proxy_url in proxies.items():
if proxy_url is None:
continue
if proxy_key == 'no':
if self._SUPPORTED_FEATURES is not None and Features.NO_PROXY not in self._SUPPORTED_FEATURES:
raise UnsupportedRequest('"no" proxy is not supported')
continue
if (
proxy_key == 'all'
and self._SUPPORTED_FEATURES is not None
and Features.ALL_PROXY not in self._SUPPORTED_FEATURES
):
raise UnsupportedRequest('"all" proxy is not supported')
# Unlikely this handler will use this proxy, so ignore.
# This is to allow a case where a proxy may be set for a protocol
# for one handler in which such protocol (and proxy) is not supported by another handler.
if self._SUPPORTED_URL_SCHEMES is not None and proxy_key not in (*self._SUPPORTED_URL_SCHEMES, 'all'):
continue
if self._SUPPORTED_PROXY_SCHEMES is None:
# Skip proxy scheme checks
continue
try:
if urllib.request._parse_proxy(proxy_url)[0] is None:
# Scheme-less proxies are not supported
raise UnsupportedRequest(f'Proxy "{proxy_url}" missing scheme')
except ValueError as e:
# parse_proxy may raise on some invalid proxy urls such as "/a/b/c"
raise UnsupportedRequest(f'Invalid proxy url "{proxy_url}": {e}')
scheme = urllib.parse.urlparse(proxy_url).scheme.lower()
if scheme not in self._SUPPORTED_PROXY_SCHEMES:
raise UnsupportedRequest(f'Unsupported proxy type: "{scheme}"')
def _check_extensions(self, extensions):
    """Check extensions for unsupported extensions. Subclasses should extend this."""
    # Table-driven type checks for the extensions handled at this level
    expected_types = {
        'cookiejar': (YoutubeDLCookieJar, NoneType),
        'timeout': (float, int, NoneType),
        'legacy_ssl': (bool, NoneType),
        'keep_header_casing': (bool, NoneType),
    }
    for extension_name, allowed in expected_types.items():
        assert isinstance(extensions.get(extension_name), allowed)
def _validate(self, request):
    """Run all handler validation checks against *request*.

    @raises UnsupportedRequest: on the first failed check, or if any
        extension remains unhandled after _check_extensions ran.
    """
    self._check_url_scheme(request)
    self._check_proxies(request.proxies or self.proxies)
    # Work on a copy: _check_extensions implementations consume handled keys
    remaining = request.extensions.copy()
    self._check_extensions(remaining)
    if remaining:
        # TODO: add support for optional extensions
        raise UnsupportedRequest(f'Unsupported extensions: {", ".join(remaining.keys())}')
@wrap_request_errors
def validate(self, request: Request):
    """Public entry point: type-check the argument, then run handler validation.

    @raises TypeError: when request is not a Request instance.
    @raises UnsupportedRequest: (from _validate) when the request is not supported.
    """
    if not isinstance(request, Request):
        raise TypeError('Expected an instance of Request')
    self._validate(request)
@wrap_request_errors
def send(self, request: Request) -> Response:
    """Public entry point: type-check the argument, then delegate to _send.

    @raises TypeError: when request is not a Request instance.
    """
    if not isinstance(request, Request):
        raise TypeError('Expected an instance of Request')
    return self._send(request)
@abc.abstractmethod
def _send(self, request: Request):
    """Handle a request from start to finish. Redefine in subclasses."""
    # Implementations return a Response (see send()'s return annotation)
    pass
def close(self):  # noqa: B027
    """Release any resources held by the handler. No-op by default."""
    pass
@classproperty
def RH_NAME(cls):
    # Display name: the class name without its trailing "RH"
    return cls.__name__[:-2]
@classproperty
def RH_KEY(cls):
    # Registry key derived from the class name; the "RH" suffix is enforced
    assert cls.__name__.endswith('RH'), 'RequestHandler class names must end with "RH"'
    return cls.__name__[:-2]
def __enter__(self):
    # Allow use as a context manager
    return self
def __exit__(self, *args):
    # Release handler resources when the context exits
    self.close()
class Request:
    """
    Represents a request to be made.
    Partially backwards-compatible with urllib.request.Request.

    @param url: url to send. Will be sanitized.
    @param data: payload data to send. Must be bytes, iterable of bytes, a file-like object or None
    @param headers: headers to send.
    @param proxies: proxy dict mapping of proto:proxy to use for the request and any redirects.
    @param query: URL query parameters to update the url with.
    @param method: HTTP method to use. If no method specified, will use POST if payload data is present else GET
    @param extensions: Dictionary of Request extensions to add, as supported by handlers.
    """

    def __init__(
        self,
        url: str,
        data: RequestData = None,
        headers: typing.Mapping | None = None,
        proxies: dict | None = None,
        query: dict | None = None,
        method: str | None = None,
        extensions: dict | None = None,
    ):
        self._headers = HTTPHeaderDict()
        self._data = None

        if query:
            url = update_url_query(url, query)

        self.url = url
        self.method = method
        if headers:
            self.headers = headers
        self.data = data  # note: must be done after setting headers
        self.proxies = proxies or {}
        self.extensions = extensions or {}

    @property
    def url(self):
        return self._url

    @url.setter
    def url(self, url):
        if not isinstance(url, str):
            raise TypeError('url must be a string')
        elif url.startswith('//'):
            # Protocol-relative URL: assume http
            url = 'http:' + url
        self._url = normalize_url(url)

    @property
    def method(self):
        # Implicit method: POST when a payload is present, GET otherwise
        return self._method or ('POST' if self.data is not None else 'GET')

    @method.setter
    def method(self, method):
        if method is None:
            self._method = None
        elif isinstance(method, str):
            # Normalize to uppercase per HTTP convention
            self._method = method.upper()
        else:
            raise TypeError('method must be a string')

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data: RequestData):
        # Try catch some common mistakes
        if data is not None and (
            not isinstance(data, (bytes, io.IOBase, Iterable)) or isinstance(data, (str, Mapping))
        ):
            raise TypeError('data must be bytes, iterable of bytes, or a file-like object')

        if data == self._data and self._data is None:
            # Re-setting None: drop any stale Content-Length header
            self.headers.pop('Content-Length', None)

        # https://docs.python.org/3/library/urllib.request.html#urllib.request.Request.data
        if data != self._data:
            # A new payload invalidates any previously set Content-Length
            if self._data is not None:
                self.headers.pop('Content-Length', None)
            self._data = data

        if self._data is None:
            self.headers.pop('Content-Type', None)

        if 'Content-Type' not in self.headers and self._data is not None:
            # Default content type, mirroring urllib.request behaviour
            self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

    @property
    def headers(self) -> HTTPHeaderDict:
        return self._headers

    @headers.setter
    def headers(self, new_headers: Mapping):
        """Replaces headers of the request. If not a HTTPHeaderDict, it will be converted to one."""
        if isinstance(new_headers, HTTPHeaderDict):
            self._headers = new_headers
        elif isinstance(new_headers, Mapping):
            self._headers = HTTPHeaderDict(new_headers)
        else:
            raise TypeError('headers must be a mapping')

    def update(self, url=None, data=None, headers=None, query=None, extensions=None):
        """Update the request in place; arguments left as None keep their current values."""
        self.data = data if data is not None else self.data
        self.headers.update(headers or {})
        self.extensions.update(extensions or {})
        self.url = update_url_query(url or self.url, query or {})

    def copy(self):
        """Return an independent copy of this request.

        Headers, proxies and extensions are copied; the payload object itself
        is shared (it may be a file-like object that cannot be copied).
        """
        return self.__class__(
            url=self.url,
            headers=copy.deepcopy(self.headers),
            proxies=copy.deepcopy(self.proxies),
            data=self._data,
            extensions=copy.copy(self.extensions),
            method=self._method,
        )
# Convenience partial constructors for common non-default HTTP methods
HEADRequest = functools.partial(Request, method='HEAD')
PATCHRequest = functools.partial(Request, method='PATCH')
PUTRequest = functools.partial(Request, method='PUT')
class Response(io.IOBase):
    """
    Base class for HTTP response adapters.

    By default, it provides a basic wrapper for a file-like response object.

    Interface partially backwards-compatible with addinfourl and http.client.HTTPResponse.

    @param fp: Original, file-like, response.
    @param url: URL that this is a response of.
    @param headers: response headers.
    @param status: Response HTTP status code. Default is 200 OK.
    @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
    @param extensions: Dictionary of handler-specific response extensions.
    """

    def __init__(
        self,
        fp: io.IOBase,
        url: str,
        headers: Mapping[str, str],
        status: int = 200,
        reason: str | None = None,
        extensions: dict | None = None,
    ):
        self.fp = fp
        # Store headers in an email Message to support repeated header names
        self.headers = Message()
        for name, value in headers.items():
            self.headers.add_header(name, value)
        self.status = status
        self.url = url
        try:
            self.reason = reason or HTTPStatus(status).phrase
        except ValueError:
            # Non-standard status code with no built-in reason phrase
            self.reason = None
        self.extensions = extensions or {}

    def readable(self):
        # io.IOBase interface: delegate to the wrapped file-like object
        return self.fp.readable()

    def read(self, amt: int | None = None) -> bytes:
        """Read up to *amt* bytes of the response body (all remaining if None).

        Expected errors raised here should be of type RequestError or subclasses.
        Subclasses should redefine this method with more precise error handling.
        """
        try:
            return self.fp.read(amt)
        except Exception as e:
            raise TransportError(cause=e) from e

    def close(self):
        # Close the wrapped response first, then mark this adapter closed
        self.fp.close()
        return super().close()

    def get_header(self, name, default=None):
        """Get header for name.
        If there are multiple matching headers, return all separated by comma."""
        headers = self.headers.get_all(name)
        if not headers:
            return default
        if name.title() == 'Set-Cookie':
            # Special case, only get the first one
            # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.3-4.1
            return headers[0]
        return ', '.join(headers)

    # The following methods are for compatibility reasons and are deprecated
    @property
    def code(self):
        deprecation_warning('Response.code is deprecated, use Response.status', stacklevel=2)
        return self.status

    def getcode(self):
        deprecation_warning('Response.getcode() is deprecated, use Response.status', stacklevel=2)
        return self.status

    def geturl(self):
        deprecation_warning('Response.geturl() is deprecated, use Response.url', stacklevel=2)
        return self.url

    def info(self):
        deprecation_warning('Response.info() is deprecated, use Response.headers', stacklevel=2)
        return self.headers

    def getheader(self, name, default=None):
        deprecation_warning('Response.getheader() is deprecated, use Response.get_header', stacklevel=2)
        return self.get_header(name, default)
if typing.TYPE_CHECKING:
    # Acceptable payload types for Request.data
    RequestData = bytes | Iterable[bytes] | typing.IO | None
    # A preference callable returns an integer bias used to order handlers
    Preference = typing.Callable[[RequestHandler, Request], int]

# Registered handler-preference callables (see register_preference)
_RH_PREFERENCES: set[Preference] = set()

View File

@@ -0,0 +1,103 @@
from __future__ import annotations
import typing
from ..utils import YoutubeDLError
if typing.TYPE_CHECKING:
from .common import RequestHandler, Response
class RequestError(YoutubeDLError):
    """Base class for all networking-layer errors raised by request handlers."""

    def __init__(
        self,
        msg: str | None = None,
        cause: Exception | str | None = None,
        handler: RequestHandler | None = None,
    ):
        # The handler that raised this error, if known
        self.handler = handler
        self.cause = cause
        # Fall back to the stringified cause when no explicit message is given
        if not msg and cause:
            msg = str(cause)
        super().__init__(msg)
class UnsupportedRequest(RequestError):
    """Raised when a handler cannot handle a request"""
    pass
class NoSupportingHandlers(RequestError):
    """raised when no handlers can support a request for various reasons"""

    def __init__(self, unsupported_errors: list[UnsupportedRequest], unexpected_errors: list[Exception]):
        self.unsupported_errors = unsupported_errors or []
        self.unexpected_errors = unexpected_errors or []

        # Group handler names under the message each one failed with, in order seen
        handlers_by_msg: dict[str, list[str]] = {}
        for error in unsupported_errors:
            handlers_by_msg.setdefault(error.msg, []).append(error.handler.RH_NAME)

        reason = ', '.join(
            f'{msg} ({", ".join(handler_names)})'
            for msg, handler_names in handlers_by_msg.items())
        if unexpected_errors:
            reason = ' + '.join(filter(None, [reason, f'{len(unexpected_errors)} unexpected error(s)']))

        message = 'Unable to handle request'
        if reason:
            message += f': {reason}'
        super().__init__(msg=message)
class TransportError(RequestError):
    """Network related errors"""
class HTTPError(RequestError):
    """An error HTTP response, or a request aborted due to a redirect loop."""

    def __init__(self, response: Response, redirect_loop=False):
        self.response = response
        self.status = response.status
        self.reason = response.reason
        self.redirect_loop = redirect_loop
        suffix = ' (redirect loop detected)' if redirect_loop else ''
        super().__init__(msg=f'HTTP Error {response.status}: {response.reason}{suffix}')

    def close(self):
        # Release the underlying response body
        self.response.close()

    def __repr__(self):
        return f'<HTTPError {self.status}: {self.reason}>'
class IncompleteRead(TransportError):
    """Raised when the response body ends before all expected bytes were read."""

    def __init__(self, partial: int, expected: int | None = None, **kwargs):
        self.partial = partial
        self.expected = expected
        details = [f'{partial} bytes read']
        if expected is not None:
            details.append(f'{expected} more expected')
        super().__init__(msg=', '.join(details), **kwargs)

    def __repr__(self):
        return f'<IncompleteRead: {self.msg}>'
class SSLError(TransportError):
    """Raised on TLS/SSL transport failures"""
    pass
class CertificateVerifyError(SSLError):
    """Raised when certificate validation has failed"""
    pass
class ProxyError(TransportError):
    """Raised on proxy-related transport failures"""
    pass
# Exception types commonly caught together by callers treating any network failure alike
network_exceptions = (HTTPError, TransportError)

View File

@@ -0,0 +1,155 @@
from __future__ import annotations
import re
from abc import ABC
from dataclasses import dataclass
from types import NoneType
from typing import Any
from .common import RequestHandler, register_preference, Request
from .exceptions import UnsupportedRequest
from ..utils import classproperty, join_nonempty
from ..utils.networking import std_headers, HTTPHeaderDict
@dataclass(order=True, frozen=True)
class ImpersonateTarget:
    """
    A target for browser impersonation.

    Parameters:
    @param client: the client to impersonate
    @param version: the client version to impersonate
    @param os: the client OS to impersonate
    @param os_version: the client OS version to impersonate

    Note: None is used to indicate to match any.
    """
    client: str | None = None
    version: str | None = None
    os: str | None = None
    os_version: str | None = None

    def __post_init__(self):
        # More-specific fields are meaningless without their parent field set
        if self.version and not self.client:
            raise ValueError('client is required if version is set')
        if self.os_version and not self.os:
            raise ValueError('os is required if os_version is set')

    def __contains__(self, target: ImpersonateTarget):
        """Return True when *target* matches this target.
        A field matches when either side leaves it unset (None) or both values agree."""
        if not isinstance(target, ImpersonateTarget):
            return False
        return (
            (self.client is None or target.client is None or self.client == target.client)
            and (self.version is None or target.version is None or self.version == target.version)
            and (self.os is None or target.os is None or self.os == target.os)
            and (self.os_version is None or target.os_version is None or self.os_version == target.os_version)
        )

    def __str__(self):
        # Rendered in the "client-version:os-os_version" form parsed by from_str,
        # with empty parts and a trailing ':' trimmed
        return f'{join_nonempty(self.client, self.version)}:{join_nonempty(self.os, self.os_version)}'.rstrip(':')

    @classmethod
    def from_str(cls, target: str):
        """Parse a target string of the form "client[-version][:os[-os_version]]"."""
        mobj = re.fullmatch(r'(?:(?P<client>[^:-]+)(?:-(?P<version>[^:-]+))?)?(?::(?:(?P<os>[^:-]+)(?:-(?P<os_version>[^:-]+))?)?)?', target)
        if not mobj:
            raise ValueError(f'Invalid impersonate target "{target}"')
        return cls(**mobj.groupdict())
class ImpersonateRequestHandler(RequestHandler, ABC):
    """
    Base class for request handlers that support browser impersonation.

    This provides a method for checking the validity of the impersonate extension,
    which can be used in _check_extensions.

    Impersonate targets consist of a client, version, os and os_ver.
    See the ImpersonateTarget class for more details.

    The following may be defined:
     - `_SUPPORTED_IMPERSONATE_TARGET_MAP`: a dict mapping supported targets to custom object.
        Any Request with an impersonate target not in this list will raise an UnsupportedRequest.
        Set to None to disable this check.
        Note: Entries are in order of preference

    Parameters:
    @param impersonate: the default impersonate target to use for requests.
                        Set to None to disable impersonation.
    """
    _SUPPORTED_IMPERSONATE_TARGET_MAP: dict[ImpersonateTarget, Any] = {}

    def __init__(self, *, impersonate: ImpersonateTarget = None, **kwargs):
        super().__init__(**kwargs)
        self.impersonate = impersonate

    def _check_impersonate_target(self, target: ImpersonateTarget):
        """Raise UnsupportedRequest if *target* cannot be resolved by this handler."""
        assert isinstance(target, (ImpersonateTarget, NoneType))
        # No target requested, or the handler declares no targets: nothing to validate
        if target is None or not self.supported_targets:
            return
        if not self.is_supported_target(target):
            raise UnsupportedRequest(f'Unsupported impersonate target: {target}')

    def _check_extensions(self, extensions):
        """Validate the optional 'impersonate' extension on top of the base checks."""
        super()._check_extensions(extensions)
        if 'impersonate' in extensions:
            self._check_impersonate_target(extensions.get('impersonate'))

    def _validate(self, request):
        # Also validate the handler-level default target
        super()._validate(request)
        self._check_impersonate_target(self.impersonate)

    def _resolve_target(self, target: ImpersonateTarget | None):
        """Resolve a target to a supported target."""
        if target is None:
            return
        # The first supported target that matches wins (entries are in preference order)
        for supported_target in self.supported_targets:
            if target in supported_target:
                if self.verbose:
                    self._logger.stdout(
                        f'{self.RH_NAME}: resolved impersonate target {target} to {supported_target}')
                return supported_target

    @classproperty
    def supported_targets(cls) -> tuple[ImpersonateTarget, ...]:
        # Keys of the target map, preserving their preference order
        return tuple(cls._SUPPORTED_IMPERSONATE_TARGET_MAP.keys())

    def is_supported_target(self, target: ImpersonateTarget):
        """Return True if *target* resolves to one of this handler's supported targets."""
        assert isinstance(target, ImpersonateTarget)
        return self._resolve_target(target) is not None

    def _get_request_target(self, request):
        """Get the requested target for the request"""
        return self._resolve_target(request.extensions.get('impersonate') or self.impersonate)

    def _prepare_impersonate_headers(self, request: Request, headers: HTTPHeaderDict) -> None:  # noqa: B027
        """Additional operations to prepare headers before building. To be extended by subclasses.
        @param request: Request object
        @param headers: Merged headers to prepare
        """

    def _get_impersonate_headers(self, request: Request) -> dict[str, str]:
        """
        Get headers for external impersonation use.
        Subclasses may define a _prepare_impersonate_headers method to modify headers after merge but before building.
        """
        headers = self._merge_headers(request.headers)
        if self._get_request_target(request) is not None:
            # remove all headers present in std_headers
            # TODO: change this to not depend on std_headers
            for k, v in std_headers.items():
                if headers.get(k) == v:
                    headers.pop(k)

        self._prepare_impersonate_headers(request, headers)
        if request.extensions.get('keep_header_casing'):
            return headers.sensitive()
        return dict(headers)
@register_preference(ImpersonateRequestHandler)
def impersonate_preference(rh, request):
    """Strongly prefer impersonation-capable handlers whenever an impersonate
    target is set on the request or on the handler itself."""
    wants_impersonation = bool(request.extensions.get('impersonate') or rh.impersonate)
    return 1000 if wants_impersonation else 0

View File

@@ -0,0 +1,23 @@
from __future__ import annotations
import abc
from .common import RequestHandler, Response
class WebSocketResponse(Response):
    """Response adapter representing an established WebSocket connection."""

    def send(self, message: bytes | str):
        """
        Send a message to the server.
        @param message: The message to send. A string (str) is sent as a text frame, bytes is sent as a binary frame.
        """
        raise NotImplementedError

    def recv(self):
        """Receive the next message from the server. To be implemented by subclasses."""
        raise NotImplementedError
class WebSocketRequestHandler(RequestHandler, abc.ABC):
    # Marker base class for handlers capable of establishing WebSocket connections
    pass