
Chapter 3 question

liaoran opened this issue 7 years ago • 2 comments

Hi, in Chapter 3's "Collect a list of all external links" example, the code from the book that uses the splitAddress function raises an error:

即将获取链接的URL是:/  (translation: "The URL about to be fetched is: /")

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-2-4254ccc1a2c3> in <module>()
     64             getAllExternalLinks(link)
     65 
---> 66 getAllExternalLinks("http://oreilly.com")

<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
     62             print("即将获取链接的URL是:" + link)
     63             allIntLinks.add(link)
---> 64             getAllExternalLinks(link)
     65 
     66 getAllExternalLinks("http://oreilly.com")

<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
     62             print("即将获取链接的URL是:" + link)
     63             allIntLinks.add(link)
---> 64             getAllExternalLinks(link)
     65 
     66 getAllExternalLinks("http://oreilly.com")

<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
     62             print("即将获取链接的URL是:" + link)
     63             allIntLinks.add(link)
---> 64             getAllExternalLinks(link)
     65 
     66 getAllExternalLinks("http://oreilly.com")

<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
     49 allIntLinks = set()
     50 def getAllExternalLinks(siteUrl):
---> 51     html = urlopen(siteUrl)
     52     bs = BeautifulSoup(html,"html.parser")
     53     internalLinks = getInternalLinks(bs,splitAddress(siteUrl)[0])

~/anaconda3/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

~/anaconda3/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    509         # accept a URL or a Request object
    510         if isinstance(fullurl, str):
--> 511             req = Request(fullurl, data)
    512         else:
    513             req = fullurl

~/anaconda3/lib/python3.6/urllib/request.py in __init__(self, url, data, headers, origin_req_host, unverifiable, method)
    327                  origin_req_host=None, unverifiable=False,
    328                  method=None):
--> 329         self.full_url = url
    330         self.headers = {}
    331         self.unredirected_hdrs = {}

~/anaconda3/lib/python3.6/urllib/request.py in full_url(self, url)
    353         self._full_url = unwrap(url)
    354         self._full_url, self.fragment = splittag(self._full_url)
--> 355         self._parse()
    356 
    357     @full_url.deleter

~/anaconda3/lib/python3.6/urllib/request.py in _parse(self)
    382         self.type, rest = splittype(self._full_url)
    383         if self.type is None:
--> 384             raise ValueError("unknown url type: %r" % self.full_url)
    385         self.host, self.selector = splithost(rest)
    386         if self.host:

ValueError: unknown url type: '/'
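
From what I can tell, the ValueError happens because the page contains a relative href like `/`, and urlopen only accepts absolute URLs, so splitAddress-style string handling is not enough. Resolving every href against the page it came from with urllib.parse.urljoin avoids this. Here is a minimal sketch of that idea (the helper name `get_internal_links` and its structure are mine, not the book's):

```python
from urllib.request import urlopen
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def get_internal_links(bs, page_url):
    """Return absolute URLs for every link on the same host as page_url."""
    base_host = urlparse(page_url).netloc
    links = set()
    for a in bs.find_all('a', href=True):
        # urljoin turns a relative href like '/' into 'http://oreilly.com/'
        absolute = urljoin(page_url, a['href'])
        if urlparse(absolute).netloc == base_host:
            links.add(absolute)
    return links

html = urlopen('http://oreilly.com')
bs = BeautifulSoup(html, 'html.parser')
print(get_internal_links(bs, 'http://oreilly.com'))
```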

Running the code you provide on GitHub also fails, with a different error:

Traceback (most recent call last):
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 81, in <module>
    getAllExternalLinks("http://oreilly.com")
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 76, in getAllExternalLinks
    getAllExternalLinks(link)
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 76, in getAllExternalLinks
    getAllExternalLinks(link)
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 76, in getAllExternalLinks
    getAllExternalLinks(link)
  [Previous line repeated 15 more times]
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 63, in getAllExternalLinks
    html = urlopen(siteUrl)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 564, in error
    result = self._call_chain(*args)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 504, in _call_chain
    result = func(*args)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 756, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 570, in error
    return self._call_chain(*args)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 504, in _call_chain
    result = func(*args)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found

Is this crash the intended end of the collection, or should the code catch the 404 and keep crawling (see the sketch below)?
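
If I understand correctly, the 404 just means the crawler followed a dead link, so the crash is not a natural end of the collection. A guard like the following would let the crawl skip dead or malformed links instead of aborting; this is a minimal sketch of my own (the helper `safe_open` and the dead-link URL are hypothetical, not from the repo):

```python
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

def safe_open(url):
    """Return the page body, or None when the link is dead or malformed."""
    try:
        return urlopen(url).read()
    except (HTTPError, URLError, ValueError) as e:
        print('Skipping {}: {}'.format(url, e))
        return None

# Hypothetical dead link: the crawler would move on instead of crashing.
if safe_open('http://oreilly.com/no-such-page') is None:
    print('dead link skipped, crawl continues')
```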

Thanks!

liaoran avatar Jan 06 '18 09:01 liaoran

Please elaborate; I can work on this.

arpit7714 avatar Feb 03 '18 16:02 arpit7714

I'm not able to fully understand the issue.

arpit7714 avatar Feb 03 '18 16:02 arpit7714