Learning_Python
Learning_Python copied to clipboard
Implement link checker
Ensure that links are not broken and that internal links use HTTPS rather than HTTP. This is based on the example provided by Hypothesis.
from hypothesis.stateful import GenericStateMachine
import hypothesis.strategies as st
from requests_html import HTMLSession
class LinkChecker(GenericStateMachine):
    """Stateful Hypothesis test that crawls hypothesis.works one link at a time.

    Each step is a URL. The first step is always the home page; every later
    step is drawn from the internal links found on the most recently fetched
    page. Executing a step fetches the URL and asserts that it responds with
    status 200 and that none of its links use the plain-HTTP form of the site.
    """

    # NOTE(review): GenericStateMachine appears to have been deprecated and
    # later removed in newer Hypothesis releases -- confirm the pinned version.

    def __init__(self):
        super().__init__()
        self.session = HTMLSession()
        # Response from the most recent fetch; None until the first step runs.
        self.result = None

    def steps(self):
        """Return a strategy that produces the next URL to visit."""
        if self.result is None:
            # Always start on the home page
            return st.just("https://hypothesis.works/")

        # Sorted so that sampled_from sees the same ordering on every run,
        # whatever order the links collection happens to iterate in; a stable
        # ordering is what lets Hypothesis replay and shrink failures.
        candidates = sorted(
            link
            for link in self.result.html.absolute_links
            # Don't try to crawl to other people's sites.
            # NOTE(review): this is a plain prefix match, so it also accepts
            # any host whose name merely begins with "hypothesis.works".
            if link.startswith("https://hypothesis.works")
            # Avoid Cloudflare's bot protection. We are a bot but we don't
            # care about the info it's hiding.
            and "/cdn-cgi/" not in link
        )
        if not candidates:
            # Dead end: no crawlable internal links on this page, so go back
            # to the home page instead of sampling from an empty list.
            return st.just("https://hypothesis.works/")
        return st.sampled_from(candidates)

    def execute_step(self, step):
        """Fetch ``step`` and check the response and the links it contains.

        The response is stored on ``self.result`` so that the next call to
        ``steps`` can pick a link from it.
        """
        self.result = self.session.get(step)
        assert self.result.status_code == 200, (
            f"{step} returned HTTP {self.result.status_code}"
        )
        for link in self.result.html.absolute_links:
            # All links should be HTTPS
            assert "http://hypothesis.works" not in link, (
                f"{step} contains a non-HTTPS internal link: {link}"
            )
# Bind the state machine's TestCase attribute to a module-level name so that
# a test runner collecting this module can find and run it.
TestLinks = LinkChecker.TestCase