weibo_terminater
weibo_terminater copied to clipboard
建议添加ip代理
由于微博账号在不同环境下登录时表现不一致(见 #47),建议添加 IP 代理。以下代码仅供参考:
import re
import requests
import pymysql
import time
import random
class SpiderProxy(object):
    """Scrape proxy candidates from ip181.com, probe them, and store them in MySQL.

    Constructing an instance immediately opens a ``requests`` session and a
    MySQL connection (database ``xici`` on 127.0.0.1:3306); the other methods
    rely on ``self.req``, ``self.con`` and ``self.cur``.
    """

    # Row pattern capturing (ip, port, protocol). Raw strings keep ``\d`` and
    # ``\.`` from being treated as (invalid) string escapes, and compiling once
    # at class level avoids rebuilding the pattern on every ``Page`` call.
    _PROXY_ROW = re.compile(
        r'<td>(\d+\.\d+\.\d+\.\d+)</td>.*?'
        r'<td>(\d+)</td>.*?'
        r'<td>.*?</td>.*?'
        r'<td>([A-Z]+)</td>.*?'
        r'<td>.*?</td>.*?'
        r'<td>.*?</td>.*?',
        re.S,
    )

    def __init__(self):
        """Create the HTTP session, the header sets and the MySQL connection."""
        self.req = requests.Session()
        # Headers sent when fetching the proxy listing page.
        self.headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Referer': 'http://www.ip181.com/',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36',
        }
        # Not used by any method of this class; kept for external callers.
        self.proxyHeaders = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36',
        }
        self.con = pymysql.Connect(
            host='127.0.0.1',
            user='root',
            password="password",
            database='xici',
            port=3306,
            charset='utf8',
        )
        self.cur = self.con.cursor()

    def getPage(self, url, timeout=10):
        """Fetch ``url`` with the listing headers and return the body as text.

        Args:
            url: Address to download.
            timeout: Seconds to wait before giving up. The original call had
                no timeout and could block forever on an unresponsive host.

        Returns:
            The decoded response body.
        """
        return self.req.get(url, headers=self.headers, timeout=timeout).text

    def Page(self, text, delay=2):
        """Extract ``(ip, port, protocol)`` tuples from the listing HTML.

        Args:
            text: HTML of the proxy listing page.
            delay: Seconds to sleep before parsing; the default matches the
                pause that used to be hard-coded. Pass 0 to parse immediately.

        Returns:
            A list of 3-tuples of strings; empty when nothing matches.
        """
        if delay:
            time.sleep(delay)
        return self._PROXY_ROW.findall(text)

    def getUrl(self):
        """Return the URL of the proxy listing page."""
        return 'http://www.ip181.com/'

    def insert(self, l):
        """Bulk-insert rows into table ``xc`` and commit.

        Args:
            l: Sequence of 3-item rows, one value per ``%s`` placeholder.
        """
        print("插入{}条".format(len(l)))
        self.cur.executemany("insert into xc values(%s,%s,%s)", l)
        self.con.commit()

    def select(self):
        """Return every ``(ip, port, protocol)`` row stored in table ``xc``."""
        self.cur.execute("select ip,port,protocol from xc")
        return self.cur.fetchall()

    def getAccessIP(self):
        """Probe the scraped proxies and return the first one that answers.

        The original built an empty dict that was never filled, so its
        ``len(p) == 1`` check never fired and the method always returned None.

        Returns:
            A ``requests``-style proxies mapping such as
            ``{"http": "http://1.2.3.4:8080"}``, or None when no candidate
            completed the probe request.
        """
        content = self.getPage(self.getUrl())
        for candidate in self.Page(content):
            ip, port, protocol = candidate
            scheme = protocol.lower()
            proxies = {scheme: "{}://{}:{}".format(scheme, ip, port)}
            # NOTE(review): the probe URL is http://, so a candidate keyed
            # under "https" is presumably not exercised by it - confirm
            # against how requests selects a proxy for a URL.
            try:
                r = self.req.get("http://ip.taobao.com/service/getIpInfo.php?ip=myip",
                                 proxies=proxies,
                                 timeout=5)
                # NOTE(review): assumes the JSON has a top-level 'ip' key;
                # a missing key is treated like any other probe failure.
                print("原始ip:", "xxx.xxx.xxx.xxx 获取到的代理ip:", r.json()['ip'])
            except Exception as e:
                # Deliberately broad: any failure just means this candidate
                # is unusable, so report it and move on to the next one.
                # TODO: remove unusable proxies.
                print("{} is invalid".format(candidate))
                print(e)
                continue
            return proxies
        return None

    def getNewipToMysql(self):
        """Scrape the listing page and store the parsed proxies in MySQL.

        The original parsed the page and then discarded the result; the rows
        are now handed to ``insert`` when at least one proxy was found.
        """
        content = self.getPage(self.getUrl())
        proxys = self.Page(content)
        if proxys:
            self.insert(proxys)
if __name__ == '__main__':
    # Script entry point: build the spider (which opens its HTTP session and
    # MySQL connection) and probe the scraped proxies.
    SpiderProxy().getAccessIP()
获取可用代理后,可以直接在 get 请求中设置代理,亲测有效。由于修改了很多源码,暂时不提交 pull request。