Learning_Python icon indicating copy to clipboard operation
Learning_Python copied to clipboard

Implement link checker

Open rsokl opened this issue 6 years ago • 0 comments

Ensure that links are not broken and that internal links use HTTPS rather than HTTP. This can be done by adapting the following example provided by Hypothesis:

from hypothesis.stateful import GenericStateMachine
import hypothesis.strategies as st
from requests_html import HTMLSession


class LinkChecker(GenericStateMachine):
    """Stateful Hypothesis test that crawls a site and validates its links.

    Each step fetches one page.  The first step always fetches ``home_url``;
    every later step fetches one of the internal links found on the page
    fetched by the previous step.  After each fetch the response must have
    status 200 and no link on the page may contain ``insecure_prefix``.

    Subclasses can point the checker at another site by overriding the class
    attributes below; the defaults reproduce the hypothesis.works example.
    """

    # Page every crawl starts from (and falls back to at a dead end).
    home_url = "https://hypothesis.works/"
    # Only links starting with this prefix are followed, so the crawl never
    # wanders onto other people's sites.
    site_prefix = "https://hypothesis.works"
    # A link containing this text is an internal link served over plain HTTP.
    insecure_prefix = "http://hypothesis.works"
    # Seconds to wait for a response; without a timeout a stalled server
    # would hang the test run indefinitely.
    request_timeout = 30

    def __init__(self):
        """Create the HTTP session; no page has been fetched yet."""
        super().__init__()
        self.session = HTMLSession()
        # Response of the most recent fetch, or None before the first step.
        self.result = None

    def steps(self):
        """Return a strategy producing the URL to fetch next.

        Returns:
            ``st.just(home_url)`` before the first fetch, or when the current
            page has no crawlable internal links; otherwise a strategy
            sampling from the current page's crawlable internal links.
        """
        if self.result is None:
            # Always start on the home page.
            return st.just(self.home_url)

        # ``absolute_links`` is a set; sort so the sequence handed to
        # ``sampled_from`` has a stable order from run to run, which
        # Hypothesis needs in order to replay and shrink failures.
        candidates = sorted(
            link
            for link in self.result.html.absolute_links
            # Don't try to crawl to other people's sites.
            if link.startswith(self.site_prefix)
            # Avoid Cloudflare's bot protection. We are a bot but we don't
            # care about the info it's hiding.
            and "/cdn-cgi/" not in link
        )
        if not candidates:
            # Dead end: sampling from an empty sequence would fail for a
            # reason unrelated to link health, so restart from the home page.
            return st.just(self.home_url)
        return st.sampled_from(candidates)

    def execute_step(self, step):
        """Fetch ``step`` and check the page and the links it contains.

        Args:
            step: URL to fetch, as produced by ``steps()``.

        Raises:
            AssertionError: if the response status is not 200, or if any link
                on the page contains ``insecure_prefix``.
        """
        self.result = self.session.get(step, timeout=self.request_timeout)

        assert self.result.status_code == 200, (
            "%s returned status %s" % (step, self.result.status_code)
        )

        for link in self.result.html.absolute_links:
            # All internal links should be HTTPS.
            assert self.insecure_prefix not in link, (
                "insecure link %s found on %s" % (link, step)
            )


# Bind the state machine's generated TestCase class to a module-level name
# so that a unittest/pytest-style test runner can discover and run it.
TestLinks = LinkChecker.TestCase

rsokl avatar Nov 14 '18 13:11 rsokl