0x1 Preface

Chrome recently shipped a headless mode, and as a scraping enthusiast I naturally had to try it. Once I put it to real use, though, I ran into plenty of pitfalls. The biggest one by far: with a proxy enabled, headless Chrome cannot load HTTPS pages properly. In other words, "--ignore-certificate-errors" does not suppress HTTPS certificate errors in headless mode, so the program never receives the data we want. A quick Google search shows that plenty of people around the world have hit the same annoying problem; see https://bugs.chromium.org/p/chromium/issues/detail?id=721739#c95 (you may need a VPN to reach it). The thread describes in detail why headless Chrome does not accept invalid SSL certificates, and if you read it carefully you will see the root cause lies on Chromium's side.
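To make the symptom concrete, here is a minimal sketch of the failing setup (a hedged example, not part of the final script; it assumes a chromedriver on your PATH, and self-signed.badssl.com is only a convenient public host with an untrusted certificate):

# Minimal reproduction sketch: headless Chrome does not honor the flag below
from selenium import webdriver

opts = webdriver.ChromeOptions()
opts.add_argument('--headless')
opts.add_argument('--ignore-certificate-errors')  # effective in headed mode, ignored in headless mode
driver = webdriver.Chrome(chrome_options=opts)
driver.get('https://self-signed.badssl.com/')     # any HTTPS endpoint with an untrusted certificate
print(driver.title)                               # in headless mode this comes back as an error page / empty content
driver.quit()

Section 0x2 shows the configuration I eventually settled on, built around an authenticating proxy plus the relevant capabilities and command-line switches.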

0x2 Implementation

# -*- coding: utf-8 -*-
import hashlib
import random
import time
from selenium import webdriver
USER_AGENTS = [
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
# Mayidaili credentials and gateway -- replace the app key / secret with your own
my_app_key = "123"
app_secret = "123"
daili_url = 's5.proxy.mayidaili.com'
daili_port = '8123'
PROXY = "http://" + daili_url + ":" + daili_port

def proxy_info():  # Build the Mayidaili proxy authorization header
    timesp = time.strftime("%Y-%m-%d %H:%M:%S")
    # Signature: MD5 of app_secret + the key/value pairs + app_secret, upper-cased
    codes = app_secret + 'app_key' + my_app_key + 'timestamp' + timesp + app_secret
    sign = hashlib.md5(codes.encode('utf-8')).hexdigest().upper()
    authHeader = 'MYH-AUTH-MD5 sign=' + sign + '&app_key=' + my_app_key + '&timestamp=' + timesp
    return authHeader
if __name__ == '__main__':
    # Configuration
    # Create a copy of the desired capabilities object.
    # On Windows, chromedriver falls back to the system (IE) proxy settings by default,
    # whereas e.g. Firefox can be given its own proxy configuration.
    desired_capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    desired_capabilities['acceptSslCerts'] = True       # ignore SSL errors (legacy capability)
    desired_capabilities['acceptInsecureCerts'] = True  # ignore SSL errors (W3C capability)
    # Change the proxy properties of that copy.
    desired_capabilities['proxy'] = {
        "httpProxy": PROXY,
        "ftpProxy": PROXY,
        "sslProxy": PROXY,
        "noProxy": None,
        "proxyType": "MANUAL",
        "class": "org.openqa.selenium.Proxy",
        "autodetect": False
    }
    # Launch a new headless Chrome instance with these options and capabilities
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument("--ignore-ssl-errors")
    options.add_argument("--proxy-bypass-list")  # note: this switch normally takes a host list, e.g. --proxy-bypass-list=127.0.0.1
    options.add_argument("--no-sandbox")
    options.add_argument("--allow-insecure-localhost")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--ignore-certificate-errors")
    # Pack a random UA and the Mayidaili auth header into the User-Agent ('||'-delimited),
    # presumably so the proxy can read the credentials it would otherwise get in a Proxy-Authorization header.
    options.add_argument('user-agent=||{}||{}||'.format(random.choice(USER_AGENTS), proxy_info()))
    # options.add_argument('window-size=1200x600')
    driver = webdriver.Chrome(chrome_options=options, desired_capabilities=desired_capabilities)
    # Try to open the test page, with up to three attempts
    for neti in range(0, 3):
        SUCCESS = True
        try:
            driver.get('https://myip.ipip.net/')
            driver.implicitly_wait(3)  # implicit wait: element lookups will wait up to 3 seconds
        except Exception as e:
            SUCCESS = False
            print(e)
            continue
        if SUCCESS:
            break
    print(driver.page_source)
    print("--finish--")
    driver.quit()
    exit(0)
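As a quick sanity check: myip.ipip.net simply echoes the IP address it sees, so if the proxy and the certificate handling both work, the page_source printed at the end should show the proxy's exit IP rather than your own. If it comes back empty or as a certificate error page, the first thing to re-check is the authorization header generated by proxy_info().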