Python 模拟 TLS 指纹爬取站点

33次阅读
没有评论

playwright

from playwright.sync_api import sync_playwright
# Address of the page to fetch. Empty in this snippet -- presumably a
# placeholder to be replaced with the real target before running.
url=''

def fetch_with_playwright(url):
    """Load ``url`` in headless Chromium, print and return the page HTML.

    The page is opened in a browser context configured with a desktop
    Chrome user agent, a 1920x1080 viewport, the ``zh-CN`` locale and the
    ``Asia/Shanghai`` time zone. Navigation waits for the ``load`` event
    before the HTML is read.

    Args:
        url: Address of the page to open.

    Returns:
        The HTML string produced by ``page.content()``.

    Raises:
        Any error raised by Playwright (for example from ``page.goto``)
        propagates to the caller; the browser is closed first.
    """
    with sync_playwright() as p:
        # Switch to headless=False while debugging to watch the browser.
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
                viewport={"width": 1920, "height": 1080},  # desktop-sized viewport
                locale="zh-CN",  # Chinese locale
                timezone_id="Asia/Shanghai",  # China time zone
            )

            page = context.new_page()
            page.goto(url, wait_until="load")

            # Optional: scroll to the bottom before reading the HTML, e.g.
            #   page.evaluate("window.scrollBy(0, document.body.scrollHeight)")

            content = page.content()
        finally:
            # Close the browser even when context creation or navigation
            # fails, so the Chromium process is not left to outer cleanup.
            browser.close()

        print(content)
        return content

# Run the fetch against the module-level `url`.
fetch_with_playwright(url)

requests(使用 tls_client 替代)

import tls_client
# Address of the page to request. Empty in this snippet -- presumably a
# placeholder to be replaced with the real target before running.
url=''
# Create a session that imitates a browser's TLS behaviour.
# NOTE(review): both a preset `client_identifier` and a custom `ja3_string`
# are passed; tls_client presumably honours only one of them -- confirm
# which setting actually takes effect and drop the other.
session = tls_client.Session(
    client_identifier="chrome_110",  # Chrome version profile to imitate
    ja3_string="771,4866-4867-4865-49196-49195-49188-49187-49162-49161-159-158-107-103-57-56-52393-52392-255,0-11-10-35-13-5-23-65281-43-45-51-21,29-23-24,0"
)

# Request headers a browser commonly sends; the User-Agent names Chrome 110,
# matching the client_identifier above.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9"
}

# Merge these headers into the session's headers.
session.headers.update(headers)

# Send the GET request.
response = session.get(url)

# Print the status code and the start of the body.
print(f"状态码: {response.status_code}")
print(f"页面内容: {response.text[:500]}")  # first 500 characters only, to keep the output short

httpx

import httpx
import ssl
# Address of the page to request. Empty in this snippet -- presumably a
# placeholder to be replaced with the real target before running.
url=''
# Custom SSL configuration: start from the default context and restrict the
# cipher list to the 'ECDHE+AESGCM' selection.
# NOTE(review): set_ciphers presumably does not alter TLS 1.3 cipher suites
# -- confirm against the `ssl` module documentation.
ssl_context = ssl.create_default_context()
ssl_context.set_ciphers('ECDHE+AESGCM')

# Create a client that uses the custom SSL context and send the GET request;
# the `with` block closes the client on exit.
with httpx.Client(verify=ssl_context) as client:
    response = client.get(url)
    print(f"状态码: {response.status_code}")
    print(f"页面内容: {response.text[:500]}")  # first 500 characters only
正文完
 0
评论(没有评论)