kindle-open-books
kindle-open-books copied to clipboard
抓取页面后,index_to_soup 返回的 soup 打印出来是乱码
from calibre.web.feeds.recipes import BasicNewsRecipe
class Python_Tutorial(BasicNewsRecipe):
    """Calibre recipe that collects the runoob.com Python tutorial pages.

    The list of pages is read from the ``div`` with id ``leftcolumn`` on the
    tutorial's landing page; every qualifying link found there becomes one
    article of a single feed named ``Python_Tutorial``.
    """

    title = 'Python Tutorial'
    description = ''
    cover_url = 'http://www.runoob.com/wp-content/uploads/2013/11/python.jpg'
    url_prefix = 'http://www.runoob.com'
    no_stylesheets = True
    # Original author's note: setting the encoding explicitly did not cure the
    # garbled text seen when printing the index soup.
    # NOTE(review): the garbling may come from the console's encoding rather
    # than from the download itself -- confirm before changing this value.
    encoding = 'utf-8'
    # Restrict each downloaded page to elements with class "article-intro".
    keep_only_tags = [{'class': 'article-intro'}]

    def get_title(self, link):
        """Return the stripped, visible text of the anchor tag *link*.

        ``tag_to_string`` gathers the text of nested tags as well, so anchors
        that are empty or whose first child is a tag (``<a><b>..</b></a>``) no
        longer raise.
        """
        return self.tag_to_string(link).strip()

    def parse_index(self):
        """Build the feed list from the tutorial's landing page.

        Returns:
            A one-element list ``[('Python_Tutorial', articles)]`` where each
            article is a dict with the keys ``'title'`` and ``'url'``.

        Raises:
            ValueError: if the landing page has no ``div`` with id
                ``leftcolumn`` to read the links from.
        """
        # Imported locally so the recipe works on both Python 3 and the older
        # Python 2 based calibre releases.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        index_url = self.url_prefix + '/python/python-tutorial.html'
        soup = self.index_to_soup(index_url)

        sidebar = soup.find('div', {'id': 'leftcolumn'})
        if sidebar is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError(
                'Could not find <div id="leftcolumn"> in %s' % index_url)

        articles = []
        for link in sidebar.findAll('a'):
            href = link.get('href')
            # Skip anchors without a target, in-page fragments, and links that
            # do not point into the /python section.
            if not href or '#' in href or '/python' not in href:
                continue
            articles.append({
                'title': self.get_title(link),
                # urljoin leaves absolute URLs intact and resolves relative
                # ones against the index page; root-relative hrefs resolve to
                # the same URL as plain prefix concatenation did.
                'url': urljoin(index_url, href),
            })

        return [('Python_Tutorial', articles)]