V2EX = way to explore
V2EX 是一个关于分享和探索的地方
Sign Up Now
For Existing Member  Sign In
U87
V2EX  ›  问与答

Python 爬虫 ip 被封,公司给了个付费快代理接口,我先验证付费接口 ip 的可用性,然后拿来爬目标网站,还是出现

  •  
  •   U87 · Jun 19, 2018 · 2730 views
    This topic was created 2874 days ago; the information mentioned may have changed or developed.

    import random
    import time
    from random import choice

    import requests
    from lxml import etree

    def get_proxy(max_attempts=5):
        """Fetch a proxy from the paid kuaidaili API and return it once it passes a check.

        Each attempt asks the API for one proxy address, then requests the
        58.com rental search page through it. The proxy is accepted when that
        request answers with HTTP 200.

        A proxy that passes this check can still stop working before the
        caller uses it, so callers must keep handling
        ``requests.exceptions.RequestException`` on their own requests.

        Args:
            max_attempts: How many proxies to try before giving up.

        Returns:
            A ``proxies`` mapping for requests, e.g. ``{'http': 'http://ip:port'}``.

        Raises:
            requests.exceptions.ProxyError: If no proxy passed the check within
                ``max_attempts`` attempts.
        """
        api_url = 'http://svip.kuaidaili.com/api/getproxy/?orderid=&num=1&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=2&an_an=1&an_ha=1&quality=2&sep=1'
        check_url = 'http://nj.58.com/chuzu/?key=%E7%A7%9F%E6%88%BF'

        for _ in range(max_attempts):
            try:
                proxy_temp = requests.get(url=api_url, timeout=1).text.strip()
                proxy = {'http': 'http://{}'.format(proxy_temp)}
                # Bound the check with a timeout so a dead proxy cannot hang the crawl.
                if requests.get(url=check_url, proxies=proxy, timeout=3).status_code == 200:
                    return proxy
            except requests.exceptions.RequestException as e:
                # A failing API call or a dead proxy just costs one attempt.
                print(e)

        # Raise instead of returning None: requests treats proxies=None as
        # "no proxy" and would silently fall back to the machine's own IP.
        raise requests.exceptions.ProxyError(
            'no working proxy after {} attempts'.format(max_attempts)
        )
    

    def crawl():
        """Crawl the rental listings under http://nj.58.com/chuzu and print each phone field.

        Steps:
          1. Fetch the first listing page and read the page count from its pager.
          2. For every listing page, fetch it through a proxy from ``get_proxy()``
             with a random User-Agent and collect the detail-page links.
          3. For every detail page, sleep a random 0-3 seconds, fetch it the same
             way, extract the listing fields and print the phone field.

        A request that fails, or a page whose layout does not match the XPath
        expressions, is reported with ``print`` and skipped; the crawl continues
        with the next page or listing. If the first page cannot be fetched or has
        no pager, the function prints the reason and returns.

        Returns:
            None.
        """
        frist_url = 'http://nj.58.com/chuzu'

        # Pool of User-Agent headers; one is picked at random per proxied request.
        headers = [
            {'User-Agent': 'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
            {'User-Agent': 'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
            {'User-Agent': 'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0;'},
            {'User-Agent': 'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1'},
            {'User-Agent': 'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11'},
            {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)'},
            {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)'},
            {'User-Agent': 'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11'},
        ]

        # NOTE(review): unlike the requests below, this one uses neither a proxy
        # nor a User-Agent header - confirm that is intended.
        try:
            resp = requests.get(url=frist_url, timeout=0.5).text
        except requests.exceptions.RequestException as e:
            print(e)
            # Without the first page there is no page count to iterate over.
            return

        attr = etree.HTML(resp)

        pager_labels = attr.xpath('//div[@class="pager"]/a/span/text()')
        if len(pager_labels) < 2:
            print('pager not found on {}'.format(frist_url))
            return
        # The second-to-last pager label is taken as the page count
        # (presumably the last label is a "next page" link - confirm).
        max_page = pager_labels[-2]

        for page in range(1, int(max_page)+1):

            next_url = frist_url + "/pn" + str(page)

            try:
                response = requests.get(url=next_url, proxies=get_proxy(), timeout=1, headers=random.choice(headers)).text
            except requests.exceptions.RequestException as e:
                # A dead proxy or a timeout only costs this listing page.
                print(e)
                continue

            attr = etree.HTML(response)

            detail_urls = attr.xpath('//ul[@class="listUl"]/li/div[@class="img_list"]/a/@href')

            for detail_url in detail_urls:

                # Random pause between detail requests.
                time.sleep(random.random()*3)

                try:
                    r = requests.get(url=detail_url, proxies=get_proxy(), timeout=1, headers=random.choice(headers)).text
                except requests.exceptions.RequestException as e:
                    print(e)
                    # Skip this listing; there is no fresh page body to parse.
                    continue

                html = etree.HTML(r)

                try:
                    if "pinpaigongyu" in detail_url:
                        # URLs containing "pinpaigongyu" use a different page layout.
                        phone = str(html.xpath('//div[@class="phonenum getPrivateCallBtnStyle"]/text()'))
                        rent_type = html.xpath('//div[@class="housedetail center cf"]/h2/text()')[0].split()[0].split('] ')[0].split(' [')[1]
                        area = html.xpath('//ul[@class="house-info-list"]/li[1]/span/text()')[0].split()[0]+"平"
                        room_type = html.xpath('//ul[@class="house-info-list"]/li[2]/span/text()')[0].split()[0]
                        addres = html.xpath('//ul[@class="house-info-list"]/li[4]/span/text()')[0].strip()
                        traffic = str(html.xpath('//ul[@class="house-info-list"]/li[5]/span/text()'))
                        pictures = html.xpath('//ul[@id="pic-list"]/li/img/@lazy_src')
                        house_description = html.xpath('//p[@id="desc"]/text()')[0].replace(' ','')
                    else:
                        phone = str(html.xpath('//div[@class="house-chat-phonenum"]/p[@class="phone-num"]/text()'))
                        rent_type = html.xpath('//ul[@class="f14"]/li[1]/span[2]/text()')[0].split('-')[0]
                        area = html.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')[0].split()[1]+"平"
                        room_type = html.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')[0].split()[0]
                        addres = html.xpath('//ul[@class="f14"]/li[6]/span[2]/text()')[0].strip()
                        traffic = str(html.xpath('//ul[@class="f14"]/li[5]/em/text()'))
                        pictures = html.xpath('//ul[@id="housePicList"]/li/img/@lazy_src')
                        house_description = str(html.xpath('//ul[@class="introduce-item"]/li[2]/span[@class="a2"]//text()')).strip()
                except IndexError:
                    # An XPath matched nothing, so the page is not the expected detail layout.
                    print('unexpected page layout: {}'.format(detail_url))
                    continue

                print(phone)
    

    # Run the crawler only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        crawl()

    1 replies    2018-06-19 20:24:34 +08:00
    U87
        1
    U87  
    OP
       Jun 19, 2018
    还是出现 requests.exceptions.ProxyError 难道是在验证和爬目标网站这时间之间 ip 失效了?
    About   ·   Help   ·   Advertise   ·   Blog   ·   API   ·   FAQ   ·   Solana   ·   762 Online   Highest 6679   ·     Select Language
    创意工作者们的社区
    World is powered by solitude
    VERSION: 3.9.8.5 · 29ms · UTC 21:17 · PVG 05:17 · LAX 14:17 · JFK 17:17
    ♥ Do have faith in what you're doing.