Closing tag </span> not found 209:76
Hello author @dcollien, how can I solve the following problem? Whenever I try to filter HTML content, I get an error of the form `Closing tag </...> not found` (for example, the one in the title). The code is as follows:
import pandas as pd
import FilterHTML
import requests
from bs4 import BeautifulSoup

# Read the CSV file; only its first row is used below.
df = pd.read_csv('xxxx.csv')

# NOTE(review): `attributes` and `html_text` are not used anywhere in the
# visible part of this script; they are kept in case later code reads them.
attributes = df.columns.tolist()
html_text = df.iloc[0]['crawl_html']
URL = df.iloc[0]['url']

# Browser-like request headers sent with the download.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    # The catch-all media range is "*/*"; the pasted value had lost its
    # asterisks, leaving ",/;q=0.8", which is not a valid media range.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# Download the page referenced by the first CSV row; HTTP error statuses
# are turned into exceptions by raise_for_status().
response = requests.get(
    URL,
    headers=headers,
    timeout=10,
    # proxies={"http": None, "https": None}
)
response.raise_for_status()
html_content = response.text

# Parse the downloaded markup with the html5lib parser and serialize the
# resulting tree back to a string (presumably to repair unbalanced tags
# before filtering -- confirm this is the intent).
soup = BeautifulSoup(html_content, 'html5lib')
fixed_html = str(soup)
# Whitelist handed to FilterHTML as `spec`: each key is an allowed tag and
# maps to that tag's allowed attributes. An attribute maps either to a
# named rule string ('url', 'measurement', 'color'), to a list of permitted
# literal values, or -- for 'style' -- to a dict of CSS property rules.
# (Rule semantics are defined by FilterHTML; see its documentation.)
whitelist = {
    'a': {
        'href': 'url',
        'target': [
            '_blank',
            '_self',
        ],
        'class': [
            'button',
        ],
    },
    'img': {
        'src': 'url',
        'width': 'measurement',
        'height': 'measurement',
    },
    'span': {
        'style': {
            'color': 'color',
            'background-color': 'color',
        },
    },
}

# Only the filtering and printing are guarded; building the literal above
# cannot fail, so it stays outside the try body.
try:
    filtered_html = FilterHTML.filter_html(fixed_html, spec=whitelist)
    print(filtered_html)
except Exception as e:
    # NOTE(review): the message below translates to "failed to download the
    # web page", but this handler only wraps the filtering step, so the
    # wording is misleading. The text is left unchanged here; chaining with
    # `from e` keeps the original FilterHTML error and traceback attached.
    raise Exception(f"下载网页失败: {e}") from e