juriscraper
juriscraper copied to clipboard
add rate_limit base method
Create a method that can be overridden to follow each website's usage policies as published in its robots.txt, preventing captchas and page blocks caused by violating those policies.
@Luis-manzur
This is great for the sample_caller, but to make this truly work we will need to force CL to abide by the rate limits set for a particular court, right?
I think we need to create a replacement for this code in CL
def get_binary_content(
    download_url: str,
    site: AbstractSite,
) -> bytes | str:
    """Download a file and return its cleaned content.

    Covers a few special cases such as invalid SSL certificates, empty
    files, unexpected content types and meta redirects.

    When ``site.method`` is ``"LOCAL"`` (the method used when testing) the
    file is read through a ``MockRequest`` rooted at ``settings.MEDIA_ROOT``
    and none of the network validations below are applied.

    :param download_url: The URL for the item you wish to download.
    :param site: Site object used to download data. Its session, headers,
        cookies, expected content types and ``cleanup_content`` hook are
        used here.
    :return: The downloaded content after ``site.cleanup_content``
    :raises NoDownloadUrlError: if ``download_url`` is empty
    :raises EmptyFileError: if the response body is empty
    :raises UnexpectedContentTypeError: if the site declares expected
        content types and the response's Content-Type header is missing,
        empty, or does not match any of them
    """
    if not download_url:
        raise NoDownloadUrlError(download_url)

    # noinspection PyBroadException
    if site.method == "LOCAL":
        # "LOCAL" is the method when testing
        url = os.path.join(settings.MEDIA_ROOT, download_url)
        mr = MockRequest(url=url)
        r = mr.get()
        # NOTE(review): this session is not used on the LOCAL path as
        # written here; kept so both branches bind `s`.
        s = requests.Session()
    else:
        # some sites require a custom ssl_context, contained in the Site's
        # session. However, we can't send a request with both a
        # custom ssl_context and `verify = False`
        has_cipher = hasattr(site, "cipher")
        s = site.request["session"] if has_cipher else requests.session()

        if site.needs_special_headers:
            headers = site.request["headers"]
        else:
            headers = {"User-Agent": "CourtListener"}

        # Note that we do a GET even if site.method is POST. This is
        # deliberate.
        r = s.get(
            download_url,
            verify=has_cipher,  # WA has a certificate we don't understand
            headers=headers,
            cookies=site.cookies,
            timeout=300,
        )

        # test for empty files (thank you CA1)
        if not r.content:
            raise EmptyFileError(f"EmptyFileError: '{download_url}'")

        # test for expected content type (thanks mont for nil)
        if site.expected_content_types:
            # Clean up content types like "application/pdf;charset=utf-8"
            # and 'application/octet-stream; charset=UTF-8'.
            # The header may be absent; fall back to "" so we raise the
            # documented UnexpectedContentTypeError below instead of an
            # AttributeError on None.
            raw_content_type = r.headers.get("Content-Type") or ""
            content_type = raw_content_type.lower().split(";")[0].strip()

            # The match is a substring test, so an empty content type must
            # be rejected explicitly: "" is contained in every string and
            # would otherwise pass for any expected type.
            m = bool(content_type) and any(
                content_type in mime.lower()
                for mime in site.expected_content_types
            )

            if not m:
                court_str = site.court_id.split(".")[-1].split("_")[0]
                fingerprint = [f"{court_str}-unexpected-content-type"]
                msg = f"'{download_url}' '{content_type}' not in {site.expected_content_types}"
                raise UnexpectedContentTypeError(msg, fingerprint=fingerprint)

        # test for and follow meta redirects
        r = follow_redirections(r, s)
        r.raise_for_status()

    content = site.cleanup_content(r.content)

    return content