python-scraping
chapter3 question
Hi, in Chapter 3, in the "collect a list of all external links" example, the code from the book errors out because of the splitAddress function. The script prints 即将获取链接的URL是:/ ("About to fetch URL: /") and then raises:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-4254ccc1a2c3> in <module>()
64 getAllExternalLinks(link)
65
---> 66 getAllExternalLinks("http://oreilly.com")
<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
62 print("即将获取链接的URL是:" + link)
63 allIntLinks.add(link)
---> 64 getAllExternalLinks(link)
65
66 getAllExternalLinks("http://oreilly.com")
<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
62 print("即将获取链接的URL是:" + link)
63 allIntLinks.add(link)
---> 64 getAllExternalLinks(link)
65
66 getAllExternalLinks("http://oreilly.com")
<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
62 print("即将获取链接的URL是:" + link)
63 allIntLinks.add(link)
---> 64 getAllExternalLinks(link)
65
66 getAllExternalLinks("http://oreilly.com")
<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
49 allIntLinks = set()
50 def getAllExternalLinks(siteUrl):
---> 51 html = urlopen(siteUrl)
52 bs = BeautifulSoup(html,"html.parser")
53 internalLinks = getInternalLinks(bs,splitAddress(siteUrl)[0])
~/anaconda3/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
~/anaconda3/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
509 # accept a URL or a Request object
510 if isinstance(fullurl, str):
--> 511 req = Request(fullurl, data)
512 else:
513 req = fullurl
~/anaconda3/lib/python3.6/urllib/request.py in __init__(self, url, data, headers, origin_req_host, unverifiable, method)
327 origin_req_host=None, unverifiable=False,
328 method=None):
--> 329 self.full_url = url
330 self.headers = {}
331 self.unredirected_hdrs = {}
~/anaconda3/lib/python3.6/urllib/request.py in full_url(self, url)
353 self._full_url = unwrap(url)
354 self._full_url, self.fragment = splittag(self._full_url)
--> 355 self._parse()
356
357 @full_url.deleter
~/anaconda3/lib/python3.6/urllib/request.py in _parse(self)
382 self.type, rest = splittype(self._full_url)
383 if self.type is None:
--> 384 raise ValueError("unknown url type: %r" % self.full_url)
385 self.host, self.selector = splithost(rest)
386 if self.host:
ValueError: unknown url type: '/'
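For what it's worth, the ValueError seems to happen because splitAddress reduces the site root to a bare path like "/", and urlopen refuses any URL without a scheme. One way around it is to resolve every href against the page it was found on with urllib.parse.urljoin, so internal links are always absolute before they are fetched. Below is a minimal sketch of that idea, not the book's exact code; the function and set names mirror the chapter's example, and the urljoin-based normalization is my own assumption about the fix:

```python
from urllib.request import urlopen
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

allIntLinks = set()
allExtLinks = set()

def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    bs = BeautifulSoup(html, 'html.parser')
    domain = urlparse(siteUrl).netloc
    for a in bs.find_all('a', href=True):
        # Resolve relative hrefs ("/", "../about", etc.) against the current
        # page, so urlopen never receives a scheme-less URL like "/"
        link = urljoin(siteUrl, a['href'])
        if urlparse(link).netloc == domain:
            if link not in allIntLinks:
                allIntLinks.add(link)
                print('About to fetch internal URL: ' + link)
                getAllExternalLinks(link)
        else:
            if link not in allExtLinks:
                allExtLinks.add(link)
                print(link)

getAllExternalLinks('http://oreilly.com')
```

With that change urlopen only ever sees fully qualified URLs, so the "unknown url type: '/'" error should go away.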
Running the code you provide on GitHub also fails:
Traceback (most recent call last):
File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 81, in <module>
getAllExternalLinks("http://oreilly.com")
File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 76, in getAllExternalLinks
getAllExternalLinks(link)
File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 76, in getAllExternalLinks
getAllExternalLinks(link)
File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 76, in getAllExternalLinks
getAllExternalLinks(link)
[Previous line repeated 15 more times]
File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 63, in getAllExternalLinks
html = urlopen(siteUrl)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 564, in error
result = self._call_chain(*args)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 756, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 570, in error
return self._call_chain(*args)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
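The 404 does not mean the collection is finished; it just means the crawler followed an internal link that no longer exists, and the unhandled HTTPError aborts the whole run. A small guard around urlopen lets the crawl skip dead pages and keep going. A minimal sketch, assuming the same getAllExternalLinks structure as above (the getPage helper name is hypothetical):

```python
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup

def getPage(url):
    # Return a parsed page, or None if the URL is dead or unreachable,
    # so a single 404 does not abort the whole crawl
    try:
        html = urlopen(url)
    except (HTTPError, URLError) as e:
        print('Skipping {}: {}'.format(url, e))
        return None
    return BeautifulSoup(html, 'html.parser')
```

Inside getAllExternalLinks you would then call bs = getPage(siteUrl) and simply return when it comes back as None.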
Does this mean the link collection just stops there?
Thanks!
Please elaborate so I can follow along; I'm not able to fully understand this.