Updated script that can be controled by Nodejs web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions

View File

@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
"""CacheControl import Interface.
Make it easy to import from cachecontrol without long namespaces.
"""
__author__ = "Eric Larson"
__email__ = "eric@ionrock.org"
__version__ = "0.14.0"
from pip._vendor.cachecontrol.adapter import CacheControlAdapter
from pip._vendor.cachecontrol.controller import CacheController
from pip._vendor.cachecontrol.wrapper import CacheControl
__all__ = [
"__author__",
"__email__",
"__version__",
"CacheControlAdapter",
"CacheController",
"CacheControl",
]
import logging
logging.getLogger(__name__).addHandler(logging.NullHandler())

View File

@ -0,0 +1,70 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import logging
from argparse import ArgumentParser
from typing import TYPE_CHECKING
from pip._vendor import requests
from pip._vendor.cachecontrol.adapter import CacheControlAdapter
from pip._vendor.cachecontrol.cache import DictCache
from pip._vendor.cachecontrol.controller import logger
if TYPE_CHECKING:
from argparse import Namespace
from pip._vendor.cachecontrol.controller import CacheController
def setup_logging() -> None:
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
logger.addHandler(handler)
def get_session() -> requests.Session:
adapter = CacheControlAdapter(
DictCache(), cache_etags=True, serializer=None, heuristic=None
)
sess = requests.Session()
sess.mount("http://", adapter)
sess.mount("https://", adapter)
sess.cache_controller = adapter.controller # type: ignore[attr-defined]
return sess
def get_args() -> Namespace:
parser = ArgumentParser()
parser.add_argument("url", help="The URL to try and cache")
return parser.parse_args()
def main() -> None:
args = get_args()
sess = get_session()
# Make a request to get a response
resp = sess.get(args.url)
# Turn on logging
setup_logging()
# try setting the cache
cache_controller: CacheController = (
sess.cache_controller # type: ignore[attr-defined]
)
cache_controller.cache_response(resp.request, resp.raw)
# Now try to get it
if cache_controller.cached_request(resp.request):
print("Cached!")
else:
print("Not cached :(")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,161 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import functools
import types
import zlib
from typing import TYPE_CHECKING, Any, Collection, Mapping
from pip._vendor.requests.adapters import HTTPAdapter
from pip._vendor.cachecontrol.cache import DictCache
from pip._vendor.cachecontrol.controller import PERMANENT_REDIRECT_STATUSES, CacheController
from pip._vendor.cachecontrol.filewrapper import CallbackFileWrapper
if TYPE_CHECKING:
from pip._vendor.requests import PreparedRequest, Response
from pip._vendor.urllib3 import HTTPResponse
from pip._vendor.cachecontrol.cache import BaseCache
from pip._vendor.cachecontrol.heuristics import BaseHeuristic
from pip._vendor.cachecontrol.serialize import Serializer
class CacheControlAdapter(HTTPAdapter):
invalidating_methods = {"PUT", "PATCH", "DELETE"}
def __init__(
self,
cache: BaseCache | None = None,
cache_etags: bool = True,
controller_class: type[CacheController] | None = None,
serializer: Serializer | None = None,
heuristic: BaseHeuristic | None = None,
cacheable_methods: Collection[str] | None = None,
*args: Any,
**kw: Any,
) -> None:
super().__init__(*args, **kw)
self.cache = DictCache() if cache is None else cache
self.heuristic = heuristic
self.cacheable_methods = cacheable_methods or ("GET",)
controller_factory = controller_class or CacheController
self.controller = controller_factory(
self.cache, cache_etags=cache_etags, serializer=serializer
)
def send(
self,
request: PreparedRequest,
stream: bool = False,
timeout: None | float | tuple[float, float] | tuple[float, None] = None,
verify: bool | str = True,
cert: (None | bytes | str | tuple[bytes | str, bytes | str]) = None,
proxies: Mapping[str, str] | None = None,
cacheable_methods: Collection[str] | None = None,
) -> Response:
"""
Send a request. Use the request information to see if it
exists in the cache and cache the response if we need to and can.
"""
cacheable = cacheable_methods or self.cacheable_methods
if request.method in cacheable:
try:
cached_response = self.controller.cached_request(request)
except zlib.error:
cached_response = None
if cached_response:
return self.build_response(request, cached_response, from_cache=True)
# check for etags and add headers if appropriate
request.headers.update(self.controller.conditional_headers(request))
resp = super().send(request, stream, timeout, verify, cert, proxies)
return resp
def build_response(
self,
request: PreparedRequest,
response: HTTPResponse,
from_cache: bool = False,
cacheable_methods: Collection[str] | None = None,
) -> Response:
"""
Build a response by making a request or using the cache.
This will end up calling send and returning a potentially
cached response
"""
cacheable = cacheable_methods or self.cacheable_methods
if not from_cache and request.method in cacheable:
# Check for any heuristics that might update headers
# before trying to cache.
if self.heuristic:
response = self.heuristic.apply(response)
# apply any expiration heuristics
if response.status == 304:
# We must have sent an ETag request. This could mean
# that we've been expired already or that we simply
# have an etag. In either case, we want to try and
# update the cache if that is the case.
cached_response = self.controller.update_cached_response(
request, response
)
if cached_response is not response:
from_cache = True
# We are done with the server response, read a
# possible response body (compliant servers will
# not return one, but we cannot be 100% sure) and
# release the connection back to the pool.
response.read(decode_content=False)
response.release_conn()
response = cached_response
# We always cache the 301 responses
elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
self.controller.cache_response(request, response)
else:
# Wrap the response file with a wrapper that will cache the
# response when the stream has been consumed.
response._fp = CallbackFileWrapper( # type: ignore[assignment]
response._fp, # type: ignore[arg-type]
functools.partial(
self.controller.cache_response, request, response
),
)
if response.chunked:
super_update_chunk_length = response._update_chunk_length
def _update_chunk_length(self: HTTPResponse) -> None:
super_update_chunk_length()
if self.chunk_left == 0:
self._fp._close() # type: ignore[union-attr]
response._update_chunk_length = types.MethodType( # type: ignore[method-assign]
_update_chunk_length, response
)
resp: Response = super().build_response(request, response) # type: ignore[no-untyped-call]
# See if we should invalidate the cache.
if request.method in self.invalidating_methods and resp.ok:
assert request.url is not None
cache_url = self.controller.cache_url(request.url)
self.cache.delete(cache_url)
# Give the request a from_cache attr to let people use it
resp.from_cache = from_cache # type: ignore[attr-defined]
return resp
def close(self) -> None:
self.cache.close()
super().close() # type: ignore[no-untyped-call]

View File

@ -0,0 +1,74 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
"""
The cache object API for implementing caches. The default is a thread
safe in-memory dictionary.
"""
from __future__ import annotations
from threading import Lock
from typing import IO, TYPE_CHECKING, MutableMapping
if TYPE_CHECKING:
from datetime import datetime
class BaseCache:
def get(self, key: str) -> bytes | None:
raise NotImplementedError()
def set(
self, key: str, value: bytes, expires: int | datetime | None = None
) -> None:
raise NotImplementedError()
def delete(self, key: str) -> None:
raise NotImplementedError()
def close(self) -> None:
pass
class DictCache(BaseCache):
def __init__(self, init_dict: MutableMapping[str, bytes] | None = None) -> None:
self.lock = Lock()
self.data = init_dict or {}
def get(self, key: str) -> bytes | None:
return self.data.get(key, None)
def set(
self, key: str, value: bytes, expires: int | datetime | None = None
) -> None:
with self.lock:
self.data.update({key: value})
def delete(self, key: str) -> None:
with self.lock:
if key in self.data:
self.data.pop(key)
class SeparateBodyBaseCache(BaseCache):
"""
In this variant, the body is not stored mixed in with the metadata, but is
passed in (as a bytes-like object) in a separate call to ``set_body()``.
That is, the expected interaction pattern is::
cache.set(key, serialized_metadata)
cache.set_body(key)
Similarly, the body should be loaded separately via ``get_body()``.
"""
def set_body(self, key: str, body: bytes) -> None:
raise NotImplementedError()
def get_body(self, key: str) -> IO[bytes] | None:
"""
Return the body as file-like object.
"""
raise NotImplementedError()

View File

@ -0,0 +1,8 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
from pip._vendor.cachecontrol.caches.file_cache import FileCache, SeparateBodyFileCache
from pip._vendor.cachecontrol.caches.redis_cache import RedisCache
__all__ = ["FileCache", "SeparateBodyFileCache", "RedisCache"]

View File

@ -0,0 +1,182 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import hashlib
import os
from textwrap import dedent
from typing import IO, TYPE_CHECKING, Union
from pathlib import Path
from pip._vendor.cachecontrol.cache import BaseCache, SeparateBodyBaseCache
from pip._vendor.cachecontrol.controller import CacheController
if TYPE_CHECKING:
from datetime import datetime
from filelock import BaseFileLock
def _secure_open_write(filename: str, fmode: int) -> IO[bytes]:
# We only want to write to this file, so open it in write only mode
flags = os.O_WRONLY
# os.O_CREAT | os.O_EXCL will fail if the file already exists, so we only
# will open *new* files.
# We specify this because we want to ensure that the mode we pass is the
# mode of the file.
flags |= os.O_CREAT | os.O_EXCL
# Do not follow symlinks to prevent someone from making a symlink that
# we follow and insecurely open a cache file.
if hasattr(os, "O_NOFOLLOW"):
flags |= os.O_NOFOLLOW
# On Windows we'll mark this file as binary
if hasattr(os, "O_BINARY"):
flags |= os.O_BINARY
# Before we open our file, we want to delete any existing file that is
# there
try:
os.remove(filename)
except OSError:
# The file must not exist already, so we can just skip ahead to opening
pass
# Open our file, the use of os.O_CREAT | os.O_EXCL will ensure that if a
# race condition happens between the os.remove and this line, that an
# error will be raised. Because we utilize a lockfile this should only
# happen if someone is attempting to attack us.
fd = os.open(filename, flags, fmode)
try:
return os.fdopen(fd, "wb")
except:
# An error occurred wrapping our FD in a file object
os.close(fd)
raise
class _FileCacheMixin:
"""Shared implementation for both FileCache variants."""
def __init__(
self,
directory: str | Path,
forever: bool = False,
filemode: int = 0o0600,
dirmode: int = 0o0700,
lock_class: type[BaseFileLock] | None = None,
) -> None:
try:
if lock_class is None:
from filelock import FileLock
lock_class = FileLock
except ImportError:
notice = dedent(
"""
NOTE: In order to use the FileCache you must have
filelock installed. You can install it via pip:
pip install cachecontrol[filecache]
"""
)
raise ImportError(notice)
self.directory = directory
self.forever = forever
self.filemode = filemode
self.dirmode = dirmode
self.lock_class = lock_class
@staticmethod
def encode(x: str) -> str:
return hashlib.sha224(x.encode()).hexdigest()
def _fn(self, name: str) -> str:
# NOTE: This method should not change as some may depend on it.
# See: https://github.com/ionrock/cachecontrol/issues/63
hashed = self.encode(name)
parts = list(hashed[:5]) + [hashed]
return os.path.join(self.directory, *parts)
def get(self, key: str) -> bytes | None:
name = self._fn(key)
try:
with open(name, "rb") as fh:
return fh.read()
except FileNotFoundError:
return None
def set(
self, key: str, value: bytes, expires: int | datetime | None = None
) -> None:
name = self._fn(key)
self._write(name, value)
def _write(self, path: str, data: bytes) -> None:
"""
Safely write the data to the given path.
"""
# Make sure the directory exists
try:
os.makedirs(os.path.dirname(path), self.dirmode)
except OSError:
pass
with self.lock_class(path + ".lock"):
# Write our actual file
with _secure_open_write(path, self.filemode) as fh:
fh.write(data)
def _delete(self, key: str, suffix: str) -> None:
name = self._fn(key) + suffix
if not self.forever:
try:
os.remove(name)
except FileNotFoundError:
pass
class FileCache(_FileCacheMixin, BaseCache):
"""
Traditional FileCache: body is stored in memory, so not suitable for large
downloads.
"""
def delete(self, key: str) -> None:
self._delete(key, "")
class SeparateBodyFileCache(_FileCacheMixin, SeparateBodyBaseCache):
"""
Memory-efficient FileCache: body is stored in a separate file, reducing
peak memory usage.
"""
def get_body(self, key: str) -> IO[bytes] | None:
name = self._fn(key) + ".body"
try:
return open(name, "rb")
except FileNotFoundError:
return None
def set_body(self, key: str, body: bytes) -> None:
name = self._fn(key) + ".body"
self._write(name, body)
def delete(self, key: str) -> None:
self._delete(key, "")
self._delete(key, ".body")
def url_to_file_path(url: str, filecache: FileCache) -> str:
"""Return the file cache path based on the URL.
This does not ensure the file exists!
"""
key = CacheController.cache_url(url)
return filecache._fn(key)

View File

@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from datetime import datetime, timezone
from typing import TYPE_CHECKING
from pip._vendor.cachecontrol.cache import BaseCache
if TYPE_CHECKING:
from redis import Redis
class RedisCache(BaseCache):
def __init__(self, conn: Redis[bytes]) -> None:
self.conn = conn
def get(self, key: str) -> bytes | None:
return self.conn.get(key)
def set(
self, key: str, value: bytes, expires: int | datetime | None = None
) -> None:
if not expires:
self.conn.set(key, value)
elif isinstance(expires, datetime):
now_utc = datetime.now(timezone.utc)
if expires.tzinfo is None:
now_utc = now_utc.replace(tzinfo=None)
delta = expires - now_utc
self.conn.setex(key, int(delta.total_seconds()), value)
else:
self.conn.setex(key, expires, value)
def delete(self, key: str) -> None:
self.conn.delete(key)
def clear(self) -> None:
"""Helper for clearing all the keys in a database. Use with
caution!"""
for key in self.conn.keys():
self.conn.delete(key)
def close(self) -> None:
"""Redis uses connection pooling, no need to close the connection."""
pass

View File

@ -0,0 +1,499 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
"""
The httplib2 algorithms ported for use with requests.
"""
from __future__ import annotations
import calendar
import logging
import re
import time
from email.utils import parsedate_tz
from typing import TYPE_CHECKING, Collection, Mapping
from pip._vendor.requests.structures import CaseInsensitiveDict
from pip._vendor.cachecontrol.cache import DictCache, SeparateBodyBaseCache
from pip._vendor.cachecontrol.serialize import Serializer
if TYPE_CHECKING:
from typing import Literal
from pip._vendor.requests import PreparedRequest
from pip._vendor.urllib3 import HTTPResponse
from pip._vendor.cachecontrol.cache import BaseCache
logger = logging.getLogger(__name__)
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
PERMANENT_REDIRECT_STATUSES = (301, 308)
def parse_uri(uri: str) -> tuple[str, str, str, str, str]:
"""Parses a URI using the regex given in Appendix B of RFC 3986.
(scheme, authority, path, query, fragment) = parse_uri(uri)
"""
match = URI.match(uri)
assert match is not None
groups = match.groups()
return (groups[1], groups[3], groups[4], groups[6], groups[8])
class CacheController:
"""An interface to see if request should cached or not."""
def __init__(
self,
cache: BaseCache | None = None,
cache_etags: bool = True,
serializer: Serializer | None = None,
status_codes: Collection[int] | None = None,
):
self.cache = DictCache() if cache is None else cache
self.cache_etags = cache_etags
self.serializer = serializer or Serializer()
self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)
@classmethod
def _urlnorm(cls, uri: str) -> str:
"""Normalize the URL to create a safe key for the cache"""
(scheme, authority, path, query, fragment) = parse_uri(uri)
if not scheme or not authority:
raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
scheme = scheme.lower()
authority = authority.lower()
if not path:
path = "/"
# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
defrag_uri = scheme + "://" + authority + request_uri
return defrag_uri
@classmethod
def cache_url(cls, uri: str) -> str:
return cls._urlnorm(uri)
def parse_cache_control(self, headers: Mapping[str, str]) -> dict[str, int | None]:
known_directives = {
# https://tools.ietf.org/html/rfc7234#section-5.2
"max-age": (int, True),
"max-stale": (int, False),
"min-fresh": (int, True),
"no-cache": (None, False),
"no-store": (None, False),
"no-transform": (None, False),
"only-if-cached": (None, False),
"must-revalidate": (None, False),
"public": (None, False),
"private": (None, False),
"proxy-revalidate": (None, False),
"s-maxage": (int, True),
}
cc_headers = headers.get("cache-control", headers.get("Cache-Control", ""))
retval: dict[str, int | None] = {}
for cc_directive in cc_headers.split(","):
if not cc_directive.strip():
continue
parts = cc_directive.split("=", 1)
directive = parts[0].strip()
try:
typ, required = known_directives[directive]
except KeyError:
logger.debug("Ignoring unknown cache-control directive: %s", directive)
continue
if not typ or not required:
retval[directive] = None
if typ:
try:
retval[directive] = typ(parts[1].strip())
except IndexError:
if required:
logger.debug(
"Missing value for cache-control " "directive: %s",
directive,
)
except ValueError:
logger.debug(
"Invalid value for cache-control directive " "%s, must be %s",
directive,
typ.__name__,
)
return retval
def _load_from_cache(self, request: PreparedRequest) -> HTTPResponse | None:
"""
Load a cached response, or return None if it's not available.
"""
# We do not support caching of partial content: so if the request contains a
# Range header then we don't want to load anything from the cache.
if "Range" in request.headers:
return None
cache_url = request.url
assert cache_url is not None
cache_data = self.cache.get(cache_url)
if cache_data is None:
logger.debug("No cache entry available")
return None
if isinstance(self.cache, SeparateBodyBaseCache):
body_file = self.cache.get_body(cache_url)
else:
body_file = None
result = self.serializer.loads(request, cache_data, body_file)
if result is None:
logger.warning("Cache entry deserialization failed, entry ignored")
return result
def cached_request(self, request: PreparedRequest) -> HTTPResponse | Literal[False]:
"""
Return a cached response if it exists in the cache, otherwise
return False.
"""
assert request.url is not None
cache_url = self.cache_url(request.url)
logger.debug('Looking up "%s" in the cache', cache_url)
cc = self.parse_cache_control(request.headers)
# Bail out if the request insists on fresh data
if "no-cache" in cc:
logger.debug('Request header has "no-cache", cache bypassed')
return False
if "max-age" in cc and cc["max-age"] == 0:
logger.debug('Request header has "max_age" as 0, cache bypassed')
return False
# Check whether we can load the response from the cache:
resp = self._load_from_cache(request)
if not resp:
return False
# If we have a cached permanent redirect, return it immediately. We
# don't need to test our response for other headers b/c it is
# intrinsically "cacheable" as it is Permanent.
#
# See:
# https://tools.ietf.org/html/rfc7231#section-6.4.2
#
# Client can try to refresh the value by repeating the request
# with cache busting headers as usual (ie no-cache).
if int(resp.status) in PERMANENT_REDIRECT_STATUSES:
msg = (
"Returning cached permanent redirect response "
"(ignoring date and etag information)"
)
logger.debug(msg)
return resp
headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)
if not headers or "date" not in headers:
if "etag" not in headers:
# Without date or etag, the cached response can never be used
# and should be deleted.
logger.debug("Purging cached response: no date or etag")
self.cache.delete(cache_url)
logger.debug("Ignoring cached response: no date")
return False
now = time.time()
time_tuple = parsedate_tz(headers["date"])
assert time_tuple is not None
date = calendar.timegm(time_tuple[:6])
current_age = max(0, now - date)
logger.debug("Current age based on date: %i", current_age)
# TODO: There is an assumption that the result will be a
# urllib3 response object. This may not be best since we
# could probably avoid instantiating or constructing the
# response until we know we need it.
resp_cc = self.parse_cache_control(headers)
# determine freshness
freshness_lifetime = 0
# Check the max-age pragma in the cache control header
max_age = resp_cc.get("max-age")
if max_age is not None:
freshness_lifetime = max_age
logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime)
# If there isn't a max-age, check for an expires header
elif "expires" in headers:
expires = parsedate_tz(headers["expires"])
if expires is not None:
expire_time = calendar.timegm(expires[:6]) - date
freshness_lifetime = max(0, expire_time)
logger.debug("Freshness lifetime from expires: %i", freshness_lifetime)
# Determine if we are setting freshness limit in the
# request. Note, this overrides what was in the response.
max_age = cc.get("max-age")
if max_age is not None:
freshness_lifetime = max_age
logger.debug(
"Freshness lifetime from request max-age: %i", freshness_lifetime
)
min_fresh = cc.get("min-fresh")
if min_fresh is not None:
# adjust our current age by our min fresh
current_age += min_fresh
logger.debug("Adjusted current age from min-fresh: %i", current_age)
# Return entry if it is fresh enough
if freshness_lifetime > current_age:
logger.debug('The response is "fresh", returning cached response')
logger.debug("%i > %i", freshness_lifetime, current_age)
return resp
# we're not fresh. If we don't have an Etag, clear it out
if "etag" not in headers:
logger.debug('The cached response is "stale" with no etag, purging')
self.cache.delete(cache_url)
# return the original handler
return False
def conditional_headers(self, request: PreparedRequest) -> dict[str, str]:
resp = self._load_from_cache(request)
new_headers = {}
if resp:
headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)
if "etag" in headers:
new_headers["If-None-Match"] = headers["ETag"]
if "last-modified" in headers:
new_headers["If-Modified-Since"] = headers["Last-Modified"]
return new_headers
def _cache_set(
self,
cache_url: str,
request: PreparedRequest,
response: HTTPResponse,
body: bytes | None = None,
expires_time: int | None = None,
) -> None:
"""
Store the data in the cache.
"""
if isinstance(self.cache, SeparateBodyBaseCache):
# We pass in the body separately; just put a placeholder empty
# string in the metadata.
self.cache.set(
cache_url,
self.serializer.dumps(request, response, b""),
expires=expires_time,
)
# body is None can happen when, for example, we're only updating
# headers, as is the case in update_cached_response().
if body is not None:
self.cache.set_body(cache_url, body)
else:
self.cache.set(
cache_url,
self.serializer.dumps(request, response, body),
expires=expires_time,
)
def cache_response(
self,
request: PreparedRequest,
response: HTTPResponse,
body: bytes | None = None,
status_codes: Collection[int] | None = None,
) -> None:
"""
Algorithm for caching requests.
This assumes a requests Response object.
"""
# From httplib2: Don't cache 206's since we aren't going to
# handle byte range requests
cacheable_status_codes = status_codes or self.cacheable_status_codes
if response.status not in cacheable_status_codes:
logger.debug(
"Status code %s not in %s", response.status, cacheable_status_codes
)
return
response_headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(
response.headers
)
if "date" in response_headers:
time_tuple = parsedate_tz(response_headers["date"])
assert time_tuple is not None
date = calendar.timegm(time_tuple[:6])
else:
date = 0
# If we've been given a body, our response has a Content-Length, that
# Content-Length is valid then we can check to see if the body we've
# been given matches the expected size, and if it doesn't we'll just
# skip trying to cache it.
if (
body is not None
and "content-length" in response_headers
and response_headers["content-length"].isdigit()
and int(response_headers["content-length"]) != len(body)
):
return
cc_req = self.parse_cache_control(request.headers)
cc = self.parse_cache_control(response_headers)
assert request.url is not None
cache_url = self.cache_url(request.url)
logger.debug('Updating cache with response from "%s"', cache_url)
# Delete it from the cache if we happen to have it stored there
no_store = False
if "no-store" in cc:
no_store = True
logger.debug('Response header has "no-store"')
if "no-store" in cc_req:
no_store = True
logger.debug('Request header has "no-store"')
if no_store and self.cache.get(cache_url):
logger.debug('Purging existing cache entry to honor "no-store"')
self.cache.delete(cache_url)
if no_store:
return
# https://tools.ietf.org/html/rfc7234#section-4.1:
# A Vary header field-value of "*" always fails to match.
# Storing such a response leads to a deserialization warning
# during cache lookup and is not allowed to ever be served,
# so storing it can be avoided.
if "*" in response_headers.get("vary", ""):
logger.debug('Response header has "Vary: *"')
return
# If we've been given an etag, then keep the response
if self.cache_etags and "etag" in response_headers:
expires_time = 0
if response_headers.get("expires"):
expires = parsedate_tz(response_headers["expires"])
if expires is not None:
expires_time = calendar.timegm(expires[:6]) - date
expires_time = max(expires_time, 14 * 86400)
logger.debug(f"etag object cached for {expires_time} seconds")
logger.debug("Caching due to etag")
self._cache_set(cache_url, request, response, body, expires_time)
# Add to the cache any permanent redirects. We do this before looking
# that the Date headers.
elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
logger.debug("Caching permanent redirect")
self._cache_set(cache_url, request, response, b"")
# Add to the cache if the response headers demand it. If there
# is no date header then we can't do anything about expiring
# the cache.
elif "date" in response_headers:
time_tuple = parsedate_tz(response_headers["date"])
assert time_tuple is not None
date = calendar.timegm(time_tuple[:6])
# cache when there is a max-age > 0
max_age = cc.get("max-age")
if max_age is not None and max_age > 0:
logger.debug("Caching b/c date exists and max-age > 0")
expires_time = max_age
self._cache_set(
cache_url,
request,
response,
body,
expires_time,
)
# If the request can expire, it means we should cache it
# in the meantime.
elif "expires" in response_headers:
if response_headers["expires"]:
expires = parsedate_tz(response_headers["expires"])
if expires is not None:
expires_time = calendar.timegm(expires[:6]) - date
else:
expires_time = None
logger.debug(
"Caching b/c of expires header. expires in {} seconds".format(
expires_time
)
)
self._cache_set(
cache_url,
request,
response,
body,
expires_time,
)
def update_cached_response(
self, request: PreparedRequest, response: HTTPResponse
) -> HTTPResponse:
"""On a 304 we will get a new set of headers that we want to
update our cached value with, assuming we have one.
This should only ever be called when we've sent an ETag and
gotten a 304 as the response.
"""
assert request.url is not None
cache_url = self.cache_url(request.url)
cached_response = self._load_from_cache(request)
if not cached_response:
# we didn't have a cached response
return response
# Lets update our headers with the headers from the new request:
# http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
#
# The server isn't supposed to send headers that would make
# the cached body invalid. But... just in case, we'll be sure
# to strip out ones we know that might be problmatic due to
# typical assumptions.
excluded_headers = ["content-length"]
cached_response.headers.update(
{
k: v
for k, v in response.headers.items()
if k.lower() not in excluded_headers
}
)
# we want a 200 b/c we have content via the cache
cached_response.status = 200
# update our cache
self._cache_set(cache_url, request, cached_response)
return cached_response

View File

@ -0,0 +1,119 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import mmap
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING, Any, Callable
if TYPE_CHECKING:
from http.client import HTTPResponse
class CallbackFileWrapper:
"""
Small wrapper around a fp object which will tee everything read into a
buffer, and when that file is closed it will execute a callback with the
contents of that buffer.
All attributes are proxied to the underlying file object.
This class uses members with a double underscore (__) leading prefix so as
not to accidentally shadow an attribute.
The data is stored in a temporary file until it is all available. As long
as the temporary files directory is disk-based (sometimes it's a
memory-backed-``tmpfs`` on Linux), data will be unloaded to disk if memory
pressure is high. For small files the disk usually won't be used at all,
it'll all be in the filesystem memory cache, so there should be no
performance impact.
"""
def __init__(
self, fp: HTTPResponse, callback: Callable[[bytes], None] | None
) -> None:
self.__buf = NamedTemporaryFile("rb+", delete=True)
self.__fp = fp
self.__callback = callback
def __getattr__(self, name: str) -> Any:
# The vaguaries of garbage collection means that self.__fp is
# not always set. By using __getattribute__ and the private
# name[0] allows looking up the attribute value and raising an
# AttributeError when it doesn't exist. This stop thigns from
# infinitely recursing calls to getattr in the case where
# self.__fp hasn't been set.
#
# [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
fp = self.__getattribute__("_CallbackFileWrapper__fp")
return getattr(fp, name)
def __is_fp_closed(self) -> bool:
try:
return self.__fp.fp is None
except AttributeError:
pass
try:
closed: bool = self.__fp.closed
return closed
except AttributeError:
pass
# We just don't cache it then.
# TODO: Add some logging here...
return False
def _close(self) -> None:
if self.__callback:
if self.__buf.tell() == 0:
# Empty file:
result = b""
else:
# Return the data without actually loading it into memory,
# relying on Python's buffer API and mmap(). mmap() just gives
# a view directly into the filesystem's memory cache, so it
# doesn't result in duplicate memory use.
self.__buf.seek(0, 0)
result = memoryview(
mmap.mmap(self.__buf.fileno(), 0, access=mmap.ACCESS_READ)
)
self.__callback(result)
# We assign this to None here, because otherwise we can get into
# really tricky problems where the CPython interpreter dead locks
# because the callback is holding a reference to something which
# has a __del__ method. Setting this to None breaks the cycle
# and allows the garbage collector to do it's thing normally.
self.__callback = None
# Closing the temporary file releases memory and frees disk space.
# Important when caching big files.
self.__buf.close()
def read(self, amt: int | None = None) -> bytes:
data: bytes = self.__fp.read(amt)
if data:
# We may be dealing with b'', a sign that things are over:
# it's passed e.g. after we've already closed self.__buf.
self.__buf.write(data)
if self.__is_fp_closed():
self._close()
return data
def _safe_read(self, amt: int) -> bytes:
data: bytes = self.__fp._safe_read(amt) # type: ignore[attr-defined]
if amt == 2 and data == b"\r\n":
# urllib executes this read to toss the CRLF at the end
# of the chunk.
return data
self.__buf.write(data)
if self.__is_fp_closed():
self._close()
return data

View File

@ -0,0 +1,154 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import calendar
import time
from datetime import datetime, timedelta, timezone
from email.utils import formatdate, parsedate, parsedate_tz
from typing import TYPE_CHECKING, Any, Mapping
if TYPE_CHECKING:
from pip._vendor.urllib3 import HTTPResponse
TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT"
def expire_after(delta: timedelta, date: datetime | None = None) -> datetime:
date = date or datetime.now(timezone.utc)
return date + delta
def datetime_to_header(dt: datetime) -> str:
return formatdate(calendar.timegm(dt.timetuple()))
class BaseHeuristic:
def warning(self, response: HTTPResponse) -> str | None:
"""
Return a valid 1xx warning header value describing the cache
adjustments.
The response is provided too allow warnings like 113
http://tools.ietf.org/html/rfc7234#section-5.5.4 where we need
to explicitly say response is over 24 hours old.
"""
return '110 - "Response is Stale"'
def update_headers(self, response: HTTPResponse) -> dict[str, str]:
"""Update the response headers with any new headers.
NOTE: This SHOULD always include some Warning header to
signify that the response was cached by the client, not
by way of the provided headers.
"""
return {}
def apply(self, response: HTTPResponse) -> HTTPResponse:
updated_headers = self.update_headers(response)
if updated_headers:
response.headers.update(updated_headers)
warning_header_value = self.warning(response)
if warning_header_value is not None:
response.headers.update({"Warning": warning_header_value})
return response
class OneDayCache(BaseHeuristic):
"""
Cache the response by providing an expires 1 day in the
future.
"""
def update_headers(self, response: HTTPResponse) -> dict[str, str]:
headers = {}
if "expires" not in response.headers:
date = parsedate(response.headers["date"])
expires = expire_after(timedelta(days=1), date=datetime(*date[:6], tzinfo=timezone.utc)) # type: ignore[index,misc]
headers["expires"] = datetime_to_header(expires)
headers["cache-control"] = "public"
return headers
class ExpiresAfter(BaseHeuristic):
"""
Cache **all** requests for a defined time period.
"""
def __init__(self, **kw: Any) -> None:
self.delta = timedelta(**kw)
def update_headers(self, response: HTTPResponse) -> dict[str, str]:
expires = expire_after(self.delta)
return {"expires": datetime_to_header(expires), "cache-control": "public"}
def warning(self, response: HTTPResponse) -> str | None:
tmpl = "110 - Automatically cached for %s. Response might be stale"
return tmpl % self.delta
class LastModified(BaseHeuristic):
"""
If there is no Expires header already, fall back on Last-Modified
using the heuristic from
http://tools.ietf.org/html/rfc7234#section-4.2.2
to calculate a reasonable value.
Firefox also does something like this per
https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching_FAQ
http://lxr.mozilla.org/mozilla-release/source/netwerk/protocol/http/nsHttpResponseHead.cpp#397
Unlike mozilla we limit this to 24-hr.
"""
cacheable_by_default_statuses = {
200,
203,
204,
206,
300,
301,
404,
405,
410,
414,
501,
}
def update_headers(self, resp: HTTPResponse) -> dict[str, str]:
headers: Mapping[str, str] = resp.headers
if "expires" in headers:
return {}
if "cache-control" in headers and headers["cache-control"] != "public":
return {}
if resp.status not in self.cacheable_by_default_statuses:
return {}
if "date" not in headers or "last-modified" not in headers:
return {}
time_tuple = parsedate_tz(headers["date"])
assert time_tuple is not None
date = calendar.timegm(time_tuple[:6])
last_modified = parsedate(headers["last-modified"])
if last_modified is None:
return {}
now = time.time()
current_age = max(0, now - date)
delta = date - calendar.timegm(last_modified)
freshness_lifetime = max(0, min(delta / 10, 24 * 3600))
if freshness_lifetime <= current_age:
return {}
expires = date + freshness_lifetime
return {"expires": time.strftime(TIME_FMT, time.gmtime(expires))}
def warning(self, resp: HTTPResponse) -> str | None:
return None

View File

@ -0,0 +1,146 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import io
from typing import IO, TYPE_CHECKING, Any, Mapping, cast
from pip._vendor import msgpack
from pip._vendor.requests.structures import CaseInsensitiveDict
from pip._vendor.urllib3 import HTTPResponse
if TYPE_CHECKING:
from pip._vendor.requests import PreparedRequest
class Serializer:
serde_version = "4"
def dumps(
self,
request: PreparedRequest,
response: HTTPResponse,
body: bytes | None = None,
) -> bytes:
response_headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(
response.headers
)
if body is None:
# When a body isn't passed in, we'll read the response. We
# also update the response with a new file handler to be
# sure it acts as though it was never read.
body = response.read(decode_content=False)
response._fp = io.BytesIO(body) # type: ignore[assignment]
response.length_remaining = len(body)
data = {
"response": {
"body": body, # Empty bytestring if body is stored separately
"headers": {str(k): str(v) for k, v in response.headers.items()},
"status": response.status,
"version": response.version,
"reason": str(response.reason),
"decode_content": response.decode_content,
}
}
# Construct our vary headers
data["vary"] = {}
if "vary" in response_headers:
varied_headers = response_headers["vary"].split(",")
for header in varied_headers:
header = str(header).strip()
header_value = request.headers.get(header, None)
if header_value is not None:
header_value = str(header_value)
data["vary"][header] = header_value
return b",".join([f"cc={self.serde_version}".encode(), self.serialize(data)])
def serialize(self, data: dict[str, Any]) -> bytes:
return cast(bytes, msgpack.dumps(data, use_bin_type=True))
def loads(
self,
request: PreparedRequest,
data: bytes,
body_file: IO[bytes] | None = None,
) -> HTTPResponse | None:
# Short circuit if we've been given an empty set of data
if not data:
return None
# Previous versions of this library supported other serialization
# formats, but these have all been removed.
if not data.startswith(f"cc={self.serde_version},".encode()):
return None
data = data[5:]
return self._loads_v4(request, data, body_file)
def prepare_response(
self,
request: PreparedRequest,
cached: Mapping[str, Any],
body_file: IO[bytes] | None = None,
) -> HTTPResponse | None:
"""Verify our vary headers match and construct a real urllib3
HTTPResponse object.
"""
# Special case the '*' Vary value as it means we cannot actually
# determine if the cached response is suitable for this request.
# This case is also handled in the controller code when creating
# a cache entry, but is left here for backwards compatibility.
if "*" in cached.get("vary", {}):
return None
# Ensure that the Vary headers for the cached response match our
# request
for header, value in cached.get("vary", {}).items():
if request.headers.get(header, None) != value:
return None
body_raw = cached["response"].pop("body")
headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(
data=cached["response"]["headers"]
)
if headers.get("transfer-encoding", "") == "chunked":
headers.pop("transfer-encoding")
cached["response"]["headers"] = headers
try:
body: IO[bytes]
if body_file is None:
body = io.BytesIO(body_raw)
else:
body = body_file
except TypeError:
# This can happen if cachecontrol serialized to v1 format (pickle)
# using Python 2. A Python 2 str(byte string) will be unpickled as
# a Python 3 str (unicode string), which will cause the above to
# fail with:
#
# TypeError: 'str' does not support the buffer interface
body = io.BytesIO(body_raw.encode("utf8"))
# Discard any `strict` parameter serialized by older version of cachecontrol.
cached["response"].pop("strict", None)
return HTTPResponse(body=body, preload_content=False, **cached["response"])
def _loads_v4(
self,
request: PreparedRequest,
data: bytes,
body_file: IO[bytes] | None = None,
) -> HTTPResponse | None:
try:
cached = msgpack.loads(data, raw=False)
except ValueError:
return None
return self.prepare_response(request, cached, body_file)

View File

@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from typing import TYPE_CHECKING, Collection
from pip._vendor.cachecontrol.adapter import CacheControlAdapter
from pip._vendor.cachecontrol.cache import DictCache
if TYPE_CHECKING:
from pip._vendor import requests
from pip._vendor.cachecontrol.cache import BaseCache
from pip._vendor.cachecontrol.controller import CacheController
from pip._vendor.cachecontrol.heuristics import BaseHeuristic
from pip._vendor.cachecontrol.serialize import Serializer
def CacheControl(
sess: requests.Session,
cache: BaseCache | None = None,
cache_etags: bool = True,
serializer: Serializer | None = None,
heuristic: BaseHeuristic | None = None,
controller_class: type[CacheController] | None = None,
adapter_class: type[CacheControlAdapter] | None = None,
cacheable_methods: Collection[str] | None = None,
) -> requests.Session:
cache = DictCache() if cache is None else cache
adapter_class = adapter_class or CacheControlAdapter
adapter = adapter_class(
cache,
cache_etags=cache_etags,
serializer=serializer,
heuristic=heuristic,
controller_class=controller_class,
cacheable_methods=cacheable_methods,
)
sess.mount("http://", adapter)
sess.mount("https://", adapter)
return sess