juriscraper add rate_limit base method

Create a method that can be overridden to follow the web page usage policies published in robots.txt, preventing captchas and page blocks caused by abuse of these policies.

1461

Jul 15 '25 16:07 Luis-manzur

@Luis-manzur

This is great for the sample_caller, but to make this truly work we will need to force CL to abide by the rate limits set out for a particular court, right?

I think we need to create a replacement for this code in CL


def get_binary_content(
    download_url: str,
    site: AbstractSite,
) -> bytes | str:
    """Downloads the file, covering a few special cases such as invalid SSL
    certificates, empty file errors and unexpected content types.

    :param download_url: The URL for the item you wish to download.
    :param site: Site object used to download data

    :return: The downloaded content after it has been passed through
        `site.cleanup_content`
    :raises: NoDownloadUrlError if `download_url` is empty; EmptyFileError if
        the response body is empty; UnexpectedContentTypeError if the site
        declares expected content types and the response's Content-Type
        header is missing, empty or does not match any of them. Whatever
        `raise_for_status()` raises for the final response also propagates.
    """
    if not download_url:
        raise NoDownloadUrlError(download_url)

    if site.method == "LOCAL":
        # "LOCAL" is the method when testing
        url = os.path.join(settings.MEDIA_ROOT, download_url)
        mr = MockRequest(url=url)
        r = mr.get()
    else:
        # some sites require a custom ssl_context, contained in the Site's
        # session. However, we can't send a request with both a
        # custom ssl_context and `verify = False`
        has_cipher = hasattr(site, "cipher")
        s = site.request["session"] if has_cipher else requests.session()

        if site.needs_special_headers:
            headers = site.request["headers"]
        else:
            headers = {"User-Agent": "CourtListener"}

        # Note that we do a GET even if site.method is POST. This is
        # deliberate.
        r = s.get(
            download_url,
            verify=has_cipher,  # WA has a certificate we don't understand
            headers=headers,
            cookies=site.cookies,
            timeout=300,
        )

        # test for empty files (thank you CA1)
        if len(r.content) == 0:
            raise EmptyFileError(f"EmptyFileError: '{download_url}'")

        # test for expected content type (thanks mont for nil)
        if site.expected_content_types:
            # Clean up content types like "application/pdf;charset=utf-8"
            # and 'application/octet-stream; charset=UTF-8'. A response may
            # omit the header entirely, so fall back to an empty string
            # instead of calling `.lower()` on None.
            raw_content_type = r.headers.get("Content-Type") or ""
            content_type = raw_content_type.lower().split(";")[0].strip()

            # The explicit emptiness check matters: "" is a substring of
            # every string, so a missing/empty content type would otherwise
            # be accepted as matching any expected type.
            m = bool(content_type) and any(
                content_type in mime.lower()
                for mime in site.expected_content_types
            )

            if not m:
                court_str = site.court_id.split(".")[-1].split("_")[0]
                fingerprint = [f"{court_str}-unexpected-content-type"]
                msg = f"'{download_url}' '{content_type}' not in {site.expected_content_types}"
                raise UnexpectedContentTypeError(msg, fingerprint=fingerprint)

        # test for and follow meta redirects
        r = follow_redirections(r, s)
        r.raise_for_status()

    content = site.cleanup_content(r.content)

    return content

Jul 16 '25 15:07 flooie

I think we need to create a replacement for this code in CL

I agree and I commented here

Jul 16 '25 20:07 Luis-manzur