Applicable scenario: most websites limit how many requests a single IP may make.
For the "frequent requests" case, we can also limit how often the crawler visits the site to avoid being banned.
#! -*- encoding:utf-8 -*-
import requests
import random

# Target page to request
targetUrl = "http://httpbin.org/ip"

# Target HTTPS page
# targetUrl = "https://httpbin.org/ip"

# Proxy server (product site: www.16yun.cn)
proxyHost = "t.16yun.cn"
proxyPort = "31111"

# Proxy tunnel credentials
proxyUser = "username"
proxyPass = "password"

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

# Use the HTTP proxy for both http and https requests
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

# Header that triggers an IP switch on the tunnel proxy
tunnel = random.randint(1, 10000)
headers = {"Proxy-Tunnel": str(tunnel)}

resp = requests.get(targetUrl, proxies=proxies, headers=headers)

print(resp.status_code)
print(resp.text)
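The Proxy-Tunnel header above is what tells the tunnel proxy to switch the exit IP, so the two ideas in this section can be combined: rotate the tunnel value on each request and add a random pause between requests to keep the access frequency low. Below is a minimal sketch of that loop, reusing the targetUrl and proxies variables from the snippet above; the number of requests and the pause range are placeholder values.

import time
import random
import requests

# Reuses targetUrl and proxies from the snippet above
for _ in range(5):
    # A new tunnel value per request asks the proxy for a different exit IP
    headers = {"Proxy-Tunnel": str(random.randint(1, 10000))}
    resp = requests.get(targetUrl, proxies=proxies, headers=headers)
    print(resp.status_code, resp.text.strip())
    time.sleep(random.uniform(1, 3))  # random delay keeps the request frequency low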
(2) mitmproxy filtering

Next, to deal with navigator.webdriver, use mitmproxy as a man-in-the-middle proxy and inject the corresponding masking code into the original site, so that the automation check is bypassed.
Inject the masking code:
TARGET_URL = 'https://g.alicdn.com/secdev/sufei_data/3.6.8/index.js'
INJECT_TEXT = 'Object.defineProperties(navigator,{webdriver:{get:() => false}});'

def response(flow):
    if flow.request.url.startswith(TARGET_URL):
        flow.response.text = INJECT_TEXT + flow.response.text
        print('Injection succeeded')
    if 'um.js' in flow.request.url or '115.js' in flow.request.url:
        # Mask the Selenium / webdriver detection
        flow.response.text = flow.response.text + INJECT_TEXT
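One simple sanity check, assuming mitmdump is already running with this script (the command is shown next) and the Selenium browser is routed through the proxy on 127.0.0.1:9000, is to ask the page for navigator.webdriver:

# Assumes `browser` is a Selenium Chrome instance started with --proxy-server=http://127.0.0.1:9000
print(browser.execute_script('return navigator.webdriver'))  # expected to print False once the injected override has run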
Start the proxy before running:

mitmdump -s httpProxy.py -p 9000

Code example:
# -*- coding:UTF-8 -*-
import time
import re
from datetime import date, timedelta
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options

TB_LOGIN_URL = 'https://login.taobao.com/member/login.jhtml'
CHROME_DRIVER = '/usr/local/bin/chromedriver'  # the chromedriver path differs between Windows and Mac


class SessionException(Exception):
    """Session exception."""

    def __init__(self, message):
        super().__init__(self)
        self.message = message

    def __str__(self):
        return self.message


class Crawler:

    def __init__(self):
        self.browser = None

    def start(self, username, password):
        print("Initializing the browser")
        self.__init_browser()
        print("Switching to the password login form")
        self.__switch_to_password_mode()
        time.sleep(0.5)
        print("Entering the username")
        self.__write_username(username)
        time.sleep(2.5)
        print("Entering the password")
        self.__write_password(password)
        time.sleep(3.5)
        print("Simulating the slider unlock")
        if self.__lock_exist():
            self.__unlock()
        print("Submitting the login request")
        self.__submit()
        time.sleep(4.5)
        # Logged in; request the target page directly
        print("Login succeeded, navigating to the target page")
        self.__navigate_to_target_page()
        time.sleep(6.5)
        print("Parsing the page content")
        crawler_list = self.__parse_page_content()
        # Connect to the database and save the data
        print("Saving the data to MySQL")
        self.__save_list_to_db(crawler_list)

    def __switch_to_password_mode(self):
        """Switch from the QR-code form to the password form."""
        if self.browser.find_element_by_id('J_QRCodeLogin').is_displayed():
            self.browser.find_element_by_id('J_Quick2Static').click()

    def __write_username(self, username):
        """Enter the account name."""
        username_input_element = self.browser.find_element_by_id('TPL_username_1')
        username_input_element.clear()
        username_input_element.send_keys(username)

    def __write_password(self, password):
        """Enter the password."""
        password_input_element = self.browser.find_element_by_id("TPL_password_1")
        password_input_element.clear()
        password_input_element.send_keys(password)

    def __lock_exist(self):
        """Check whether the slider verification is shown."""
        return self.__is_element_exist('#nc_1_wrapper') and self.browser.find_element_by_id(
            'nc_1_wrapper').is_displayed()

    def __is_element_exist(self, css_selector):
        """Check whether an element exists. This helper was referenced but not defined in the
        original snippet; a minimal implementation is assumed here."""
        try:
            self.browser.find_element_by_css_selector(css_selector)
            return True
        except NoSuchElementException:
            return False

    def __unlock(self):
        """Perform the slider unlock."""
        bar_element = self.browser.find_element_by_id('nc_1_n1z')
        ActionChains(self.browser).drag_and_drop_by_offset(bar_element, 800, 0).perform()
        time.sleep(1.5)
        self.browser.get_screenshot_as_file('error.png')
        if self.__is_element_exist('.errloading > span'):
            error_message_element = self.browser.find_element_by_css_selector('.errloading > span')
            error_message = error_message_element.text
            self.browser.execute_script('noCaptcha.reset(1)')
            raise SessionException('Slider verification failed, message = ' + error_message)

    def __submit(self):
        """Submit the login form."""
        self.browser.find_element_by_id('J_SubmitStatic').click()
        time.sleep(0.5)
        if self.__is_element_exist("#J_Message"):
            error_message_element = self.browser.find_element_by_css_selector('#J_Message > p')
            error_message = error_message_element.text
            raise SessionException('Login error, message = ' + error_message)

    # Navigate to the target page
    def __navigate_to_target_page(self):
        pass

    # Parse the page data
    def __parse_page_content(self):
        pass

    # Save the data
    def __save_list_to_db(self, crawler_list):
        pass

    def __init_browser(self):
        """Initialize the Selenium browser."""
        options = Options()
        # options.add_argument("--headless")
        prefs = {"profile.managed_default_content_settings.images": 1}
        options.add_experimental_option("prefs", prefs)
        options.add_argument('--proxy-server=http://127.0.0.1:9000')
        options.add_argument('disable-infobars')
        options.add_argument('--no-sandbox')
        self.browser = webdriver.Chrome(executable_path=CHROME_DRIVER, options=options)
        self.browser.implicitly_wait(3)
        self.browser.maximize_window()
        self.browser.get(TB_LOGIN_URL)


# Entry point
Crawler().start('username', 'password')
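The __navigate_to_target_page, __parse_page_content, and __save_list_to_db methods above are left as stubs. As a rough illustration of the last step, here is a minimal sketch of saving parsed records to MySQL with pymysql; the connection details, table name, and column layout are assumptions, not part of the original code, and each record is assumed to be a dict with title and price keys.

import pymysql

def __save_list_to_db(self, crawler_list):
    """Save parsed records to MySQL (illustrative only; the schema is assumed)."""
    connection = pymysql.connect(host='127.0.0.1', user='root', password='******',
                                 database='crawler', charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            # Assumed table: crawler_item(title VARCHAR, price VARCHAR)
            sql = 'INSERT INTO crawler_item (title, price) VALUES (%s, %s)'
            cursor.executemany(sql, [(item['title'], item['price']) for item in crawler_list])
        connection.commit()
    finally:
        connection.close()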
Here is a relatively simple login approach: choose Weibo login to sidestep the slider captcha that may otherwise appear.
import time
import requests
from selenium import webdriver

try:
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    # The next line opens Chrome in "developer" mode, hiding the automation switch
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(options=chrome_options)
    browser.get("https://s.taobao.com/search?q=iPad")
    button = browser.find_element_by_class_name('login-switch')
    button.click()
    button = browser.find_element_by_class_name('weibo-login')
    button.click()
    user_name = browser.find_element_by_name('username')
    user_name.clear()
    user_name.send_keys('*****')  # Weibo account name; it must already be bound to the Taobao account
    time.sleep(1)
    user_keys = browser.find_element_by_name('password')
    user_keys.clear()
    user_keys.send_keys('*****')  # Weibo password
    time.sleep(1)
    button = browser.find_element_by_class_name('W_btn_g')
    button.click()
    time.sleep(1)
    cookies = browser.get_cookies()
    ses = requests.Session()  # keep the logged-in state
    c = requests.cookies.RequestsCookieJar()
    for item in cookies:
        c.set(item["name"], item["value"])
    ses.cookies.update(c)
    time.sleep(1)
    print('Login succeeded')
except Exception:
    print('Login failed')

2. Disguise as a browser, or defeat "anti-hotlinking"
Some websites check whether a request really comes from a browser or from an automated program. In that case, add a User-Agent header to show that you are visiting from a browser. Some sites also check whether the Referer header is present and whether it is legitimate, so it is usually worth adding a Referer as well.
The User-Agent can be taken from the real-browser UA library provided by 亿牛云, and the Referer can be disguised as coming from a Baidu search.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.1276.73 Safari/537.36',
    'Referer': 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=nike'
}
response = requests.get(url=url, headers=headers)
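A long-running crawler should not send the identical User-Agent on every request, so a common refinement is to choose the header set at random per request. A minimal sketch follows; the short list of UA strings is purely illustrative and is not the real-browser library mentioned above.

import random
import requests

# Illustrative User-Agent strings; in practice draw from a larger, regularly updated library
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0',
]

def fetch(url):
    # Pick a random User-Agent and pair it with a Baidu-style Referer for each request
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Referer': 'https://www.baidu.com/s?wd=nike',
    }
    return requests.get(url, headers=headers, timeout=10)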
Crawling is a long-running task, so it needs to be combined with proxies to keep collecting data reliably; the enhanced 亿牛云 crawler proxy lets a crawler program gather data stably over long periods.