weibo_terminater icon indicating copy to clipboard operation
weibo_terminater copied to clipboard

建议添加ip代理

Open ray0807 opened this issue 7 years ago • 0 comments

由于微博账号登录在不同环境下表现不同(参见 #47),建议添加 IP 代理。以下代码仅供参考:

import re
import requests
import pymysql
import time
import random

class SpiderProxy(object):
    """Scrape free proxies from ip181.com, probe them, and store them in MySQL.

    Each scraped proxy is represented as a ``(ip, port, protocol)`` tuple of
    strings, e.g. ``("1.2.3.4", "8080", "HTTP")``.

    Instantiating the class opens a MySQL connection immediately (see
    ``__init__``), so a reachable database is required even when only the
    probing methods are used.
    """

    # Seconds to wait for the proxy-list page before giving up; without a
    # timeout ``requests`` can block forever on an unresponsive server.
    PAGE_TIMEOUT = 10

    # Captures (ip, port, protocol) from six consecutive <td> cells:
    # ip, port, <skipped>, protocol, <skipped>, <skipped>.
    # Raw strings keep ``\d`` / ``\.`` from being treated as (invalid)
    # Python string escapes.
    _ROW_PATTERN = re.compile(
        r'<td>(\d+\.\d+\.\d+\.\d+)</td>.*?'
        r'<td>(\d+)</td>.*?'
        r'<td>.*?</td>.*?'
        r'<td>([A-Z]+)</td>.*?'
        r'<td>.*?</td>.*?'
        r'<td>.*?</td>.*?',
        re.S)

    def __init__(self):
        """Create the HTTP session, request headers and MySQL connection."""
        self.req = requests.Session()
        # Headers sent when downloading the proxy-list page.
        self.headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Referer': 'http://www.ip181.com/',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36',
        }
        # Minimal headers; not used by any method in this class.
        self.proxyHeaders = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36',
        }
        # NOTE(review): credentials are hard-coded placeholders; adjust them
        # for the local database before running.
        self.con = pymysql.Connect(
            host='127.0.0.1',
            user='root',
            password="password",
            database='xici',
            port=3306,
            charset='utf8',
        )
        self.cur = self.con.cursor()

    def getPage(self, url):
        """Download ``url`` with the session headers and return the body text."""
        return self.req.get(url, headers=self.headers,
                            timeout=self.PAGE_TIMEOUT).text

    def Page(self, text, delay=2):
        """Extract proxies from the HTML of a proxy-list page.

        Args:
            text: HTML source to scan.
            delay: Seconds to sleep before parsing. Defaults to 2, which is
                the pause the method has always applied; pass 0 to skip it.

        Returns:
            A list of ``(ip, port, protocol)`` string tuples, empty when
            nothing matches.
        """
        if delay > 0:
            time.sleep(delay)
        return self._ROW_PATTERN.findall(text)

    def getUrl(self):
        """Return the URL of the proxy-list page."""
        return 'http://www.ip181.com/'

    def insert(self, l):
        """Insert ``(ip, port, protocol)`` rows into table ``xc`` and commit."""
        print("插入{}条".format(len(l)))
        self.cur.executemany("insert into xc values(%s,%s,%s)", l)
        self.con.commit()

    def select(self):
        """Return every ``(ip, port, protocol)`` row stored in table ``xc``."""
        self.cur.execute("select ip,port,protocol from xc")
        return self.cur.fetchall()

    @staticmethod
    def _proxy_mapping(entry):
        """Build a ``requests``-style proxies dict from an (ip, port, protocol) tuple.

        Example: ``("1.2.3.4", "8080", "HTTP")`` ->
        ``{"http": "http://1.2.3.4:8080"}``.
        """
        ip, port, protocol = entry[0], entry[1], entry[2]
        scheme = "{}".format(protocol).lower()
        return {scheme: "{}://{}:{}".format(protocol, ip, port).lower()}

    def getAccessIP(self):
        """Scrape the proxy list and return the first proxy that responds.

        Every scraped proxy is probed by requesting an IP-echo service
        through it with a 5 second timeout.

        Returns:
            A proxies dict such as ``{"http": "http://1.2.3.4:8080"}`` that
            can be passed as ``proxies=`` to ``requests``, or ``None`` when
            no scraped proxy passed the probe.
        """
        content = self.getPage(self.getUrl())
        for entry in self.Page(content):
            proxies = self._proxy_mapping(entry)
            try:
                r = self.req.get("http://ip.taobao.com/service/getIpInfo.php?ip=myip",
                                 proxies=proxies,
                                 timeout=5)
                # NOTE(review): assumes the service answers with a top-level
                # "ip" key; if the payload is shaped differently every proxy
                # will be reported as failing -- confirm against the service.
                print("原始ip:", "xxx.xxx.xxx.xxx  获取到的代理ip:", r.json()['ip'])
            except Exception as e:
                # Best-effort probe: any failure (connection error, timeout,
                # bad JSON, missing key) just means this proxy is unusable,
                # so report it and try the next one.
                # TODO: delete unusable proxies from the database.
                print("{} is invalid".format(entry))
                print(e)
            else:
                # First proxy that answered successfully wins.
                return proxies
        return None

    def getNewipToMysql(self):
        """Scrape the proxy list and store the scraped proxies in MySQL.

        Returns:
            The list of ``(ip, port, protocol)`` tuples that was scraped
            (and inserted when non-empty).
        """
        content = self.getPage(self.getUrl())
        proxys = self.Page(content)
        if proxys:
            self.insert(proxys)
        return proxys

if __name__ == '__main__':
    # Script entry point: build the spider (this opens the MySQL connection)
    # and probe the scraped proxies.
    SpiderProxy().getAccessIP()

获取可用代理后,可以直接在 get 请求中设置代理,亲测有效。由于修改了很多源码,所以暂时不提交 pull request。

ray0807 avatar Jan 09 '18 09:01 ray0807