Files
CFspider/cfspider/api.py
2026-03-27 14:02:57 +08:00

1205 lines
40 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
CFspider 核心 API 模块
提供同步 HTTP 请求功能,支持:
- 通过 Cloudflare Workers VLESS 代理请求
- TLS 指纹模拟 (curl_cffi)
- HTTP/2 支持 (httpx)
- 隐身模式(完整浏览器请求头)
- IP 地图可视化
"""
import requests
import time
from urllib.parse import urlencode, quote, urlparse
from typing import Optional, Any
# 延迟导入 IP 地图模块
from . import ip_map
# 延迟导入 httpx仅在需要 HTTP/2 时使用
_httpx = None
def _get_httpx():
"""延迟加载 httpx 模块"""
global _httpx
if _httpx is None:
try:
import httpx
_httpx = httpx
except ImportError:
raise ImportError(
"httpx is required for HTTP/2 support. "
"Install it with: pip install httpx[http2]"
)
return _httpx
# 延迟导入 curl_cffi仅在需要 TLS 指纹时使用
_curl_cffi = None
def _get_curl_cffi():
"""延迟加载 curl_cffi 模块"""
global _curl_cffi
if _curl_cffi is None:
try:
from curl_cffi import requests as curl_requests
_curl_cffi = curl_requests
except ImportError:
raise ImportError(
"curl_cffi is required for TLS fingerprint impersonation. "
"Install it with: pip install curl_cffi"
)
return _curl_cffi
class CFSpiderResponse:
"""
CFspider 响应对象
封装 HTTP 响应,提供与 requests.Response 兼容的接口,
并额外提供 Cloudflare 特有的信息如节点代码、Ray ID
Attributes:
cf_colo (str): Cloudflare 数据中心代码(如 NRT=东京, SIN=新加坡, LAX=洛杉矶)
使用 Workers 代理时可用,表示请求经过的 CF 节点
cf_ray (str): Cloudflare Ray ID每个请求的唯一标识符
可用于调试和追踪请求
text (str): 响应文本内容(自动解码)
content (bytes): 响应原始字节内容
status_code (int): HTTP 状态码(如 200, 404, 500
headers (dict): 响应头字典
cookies: 响应 Cookie
url (str): 最终请求的 URL跟随重定向后
encoding (str): 响应编码
Methods:
json(**kwargs): 将响应解析为 JSON
raise_for_status(): 当状态码非 2xx 时抛出 HTTPError
Example:
>>> response = cfspider.get("https://httpbin.org/ip", cf_proxies="...", uuid="...")
>>> print(response.status_code) # 200
>>> print(response.cf_colo) # NRT (东京节点)
>>> data = response.json()
>>> print(data['origin']) # Cloudflare IP
"""
def __init__(self, response, cf_colo=None, cf_ray=None):
"""
初始化响应对象
Args:
response: 原始 requests/httpx/curl_cffi 响应对象
cf_colo: Cloudflare 数据中心代码(从响应头获取)
cf_ray: Cloudflare Ray ID从响应头获取
"""
self._response = response
self.cf_colo = cf_colo
self.cf_ray = cf_ray
@property
def text(self) -> str:
"""响应文本内容(自动解码)"""
return self._response.text
@property
def content(self) -> bytes:
"""响应原始字节内容"""
return self._response.content
@property
def status_code(self) -> int:
"""HTTP 状态码"""
return self._response.status_code
@property
def headers(self):
"""响应头字典"""
return self._response.headers
@property
def cookies(self):
"""响应 Cookie"""
return self._response.cookies
@property
def url(self) -> str:
"""最终请求的 URL跟随重定向后"""
return self._response.url
@property
def encoding(self) -> Optional[str]:
"""响应编码"""
return self._response.encoding
@encoding.setter
def encoding(self, value: str):
"""设置响应编码"""
self._response.encoding = value
def json(self, **kwargs) -> Any:
"""
将响应解析为 JSON
Args:
**kwargs: 传递给 json.loads() 的参数
Returns:
解析后的 JSON 数据dict 或 list
Raises:
JSONDecodeError: 当响应不是有效的 JSON 时
"""
return self._response.json(**kwargs)
def raise_for_status(self):
"""
当状态码非 2xx 时抛出 HTTPError
Raises:
requests.HTTPError: 当状态码表示错误时
"""
self._response.raise_for_status()
# ========== 数据提取方法 ==========
def _get_extractor(self):
"""获取数据提取器(延迟初始化)"""
if not hasattr(self, '_extractor') or self._extractor is None:
from .extract import Extractor
content_type = "json" if self._is_json_response() else "html"
self._extractor = Extractor(self.text, content_type)
return self._extractor
def _is_json_response(self) -> bool:
"""判断是否是 JSON 响应"""
content_type = self.headers.get("content-type", "")
return "application/json" in content_type.lower()
def find(self, selector: str, attr: str = None, strip: bool = True,
regex: str = None, parser=None):
"""
查找第一个匹配的元素(最简单的 API
自动识别选择器类型:
- 以 $ 开头JSONPath
- 以 // 开头XPath
- 其他CSS 选择器
Args:
selector: 选择器CSS/XPath/JSONPath
attr: 要提取的属性名
strip: 是否去除空白
regex: 正则表达式提取
parser: 自定义解析函数
Returns:
匹配的文本或属性值
Example:
>>> response.find("h1") # CSS
>>> response.find("//h1/text()") # XPath
>>> response.find("$.title") # JSONPath
"""
return self._get_extractor().find(selector, attr=attr, strip=strip,
regex=regex, parser=parser)
def find_all(self, selector: str, attr: str = None, strip: bool = True):
"""
查找所有匹配的元素
Args:
selector: 选择器CSS/XPath/JSONPath
attr: 要提取的属性名
strip: 是否去除空白
Returns:
匹配的文本或属性值列表
"""
return self._get_extractor().find_all(selector, attr=attr, strip=strip)
def css(self, selector: str, attr: str = None, html: bool = False, strip: bool = True):
"""
使用 CSS 选择器提取第一个匹配元素
Args:
selector: CSS 选择器
attr: 要提取的属性名
html: 是否返回 HTML 而非文本
strip: 是否去除空白
Returns:
匹配元素的文本、属性或 HTML
"""
return self._get_extractor().css(selector, attr=attr, html=html, strip=strip)
def css_all(self, selector: str, attr: str = None, html: bool = False, strip: bool = True):
"""
使用 CSS 选择器提取所有匹配元素
Args:
selector: CSS 选择器
attr: 要提取的属性名
html: 是否返回 HTML 而非文本
strip: 是否去除空白
Returns:
匹配元素的文本、属性或 HTML 列表
"""
return self._get_extractor().css_all(selector, attr=attr, html=html, strip=strip)
def css_one(self, selector: str):
"""
返回第一个匹配的 Element 对象,支持链式操作
Args:
selector: CSS 选择器
Returns:
Element 对象
"""
return self._get_extractor().css_one(selector)
def xpath(self, expression: str):
"""
使用 XPath 表达式提取第一个匹配
Args:
expression: XPath 表达式
Returns:
匹配的文本或属性值
"""
return self._get_extractor().xpath(expression)
def xpath_all(self, expression: str):
"""
使用 XPath 表达式提取所有匹配
Args:
expression: XPath 表达式
Returns:
匹配的文本或属性值列表
"""
return self._get_extractor().xpath_all(expression)
def xpath_one(self, expression: str):
"""
返回第一个匹配的 Element 对象
Args:
expression: XPath 表达式
Returns:
Element 对象
"""
return self._get_extractor().xpath_one(expression)
def jpath(self, expression: str):
"""
使用 JSONPath 表达式提取第一个匹配
Args:
expression: JSONPath 表达式(如 $.data.items[0].name
Returns:
匹配的值
"""
return self._get_extractor().jpath(expression)
def jpath_all(self, expression: str):
"""
使用 JSONPath 表达式提取所有匹配
Args:
expression: JSONPath 表达式
Returns:
匹配的值列表
"""
return self._get_extractor().jpath_all(expression)
def pick(self, **fields):
"""
批量提取多个字段
Args:
**fields: 字段名=选择器 的映射
- 字符串CSS 选择器,提取文本
- 元组 (selector, attr):提取属性
- 元组 (selector, attr, converter):提取并转换
Returns:
ExtractResult 字典,支持直接保存
Example:
>>> data = response.pick(
... title="h1",
... links=("a", "href"),
... price=(".price", "text", float),
... )
>>> data.save("output.csv")
"""
result = self._get_extractor().pick(**fields)
result.url = str(self.url)
return result
def extract(self, rules: dict):
"""
使用规则字典提取数据(支持前缀指定类型)
Args:
rules: 字段名到选择器的映射
选择器可以带前缀指定类型:
- "css:h1.title" 或直接 "h1.title"
- "xpath://a/@href"
- "jsonpath:$.data.name"
Returns:
ExtractResult 字典
"""
result = self._get_extractor().extract(rules)
result.url = str(self.url)
return result
def save(self, filepath: str, encoding: str = "utf-8"):
"""
保存响应内容到文件
Args:
filepath: 输出文件路径
encoding: 文件编码(仅用于文本内容)
Returns:
输出文件的绝对路径
"""
from .export import save_response
return save_response(self.content, filepath, encoding=encoding)
def request(method, url, cf_proxies=None, uuid=None, http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome', delay=None,
static_ip=False, two_proxy=None, **kwargs):
"""
发送 HTTP 请求
通过 CFspider Workers VLESS 代理发送请求,完全隐藏 Cloudflare 特征。
Args:
method: HTTP 方法GET, POST, PUT, DELETE, HEAD, OPTIONS, PATCH
url: 目标 URL必须包含协议如 https://
cf_proxies: CFspider Workers 地址(可选)
"https://cfspider.violetqqcom.workers.dev"
不填写时直接请求,不使用代理
uuid: VLESS UUID可选
不填写会自动从 Workers 获取
static_ip: 是否使用固定 IP默认 False
- False: 每次请求获取新的出口 IP适合大规模采集
- True: 保持使用同一个 IP适合需要会话一致性的场景
two_proxy: 第二层代理(可选)
格式: "host:port:user:pass""host:port"
例如: "us.cliproxy.io:3010:username:password"
用于无法直连第二层代理时,通过 Workers 作为跳板
流程: 本地 → Workers (VLESS) → 第二层代理 → 目标网站
http2: 是否启用 HTTP/2 协议(默认 False
impersonate: TLS 指纹模拟(可选)
可选值: chrome131, safari18_0, firefox133, edge101 等
map_output: 是否生成 IP 地图 HTML 文件(默认 False
map_file: 地图输出文件名(默认 "cfspider_map.html"
stealth: 是否启用隐身模式(默认 False
自动添加 15+ 个完整浏览器请求头
stealth_browser: 隐身模式浏览器类型(默认 'chrome'
可选值: chrome, firefox, safari, edge, chrome_mobile
delay: 请求前的随机延迟范围(秒),如 (1, 3)
**kwargs: 其他参数,与 requests 库完全兼容
- params: URL 查询参数
- headers: 自定义请求头
- data: 表单数据
- json: JSON 数据
- cookies: Cookie
- timeout: 超时时间(秒),默认 30
Returns:
CFSpiderResponse: 响应对象 / Response object
- text: 响应文本 / Response text
- content: 响应字节 / Response bytes
- json(): 解析 JSON / Parse JSON
- status_code: HTTP 状态码 / HTTP status code
- headers: 响应头 / Response headers
Examples:
>>> import cfspider
>>>
>>> # 基本 GET 请求(无代理)
>>> response = cfspider.get("https://httpbin.org/ip")
>>> print(response.json())
>>>
>>> # 使用 Workers VLESS 代理(自动获取 UUID每次新 IP
>>> response = cfspider.get(
... "https://httpbin.org/ip",
... cf_proxies="https://cfspider.violetqqcom.workers.dev"
... )
>>> print(response.json()) # 出口 IP
>>>
>>> # 多次请求自动获取不同 IP
>>> for i in range(5):
... response = cfspider.get(
... "https://httpbin.org/ip",
... cf_proxies="https://cfspider.violetqqcom.workers.dev"
... )
... print(response.json()['origin']) # 每次都是不同 IP
"""
# 移除不支持的旧版参数(保持向后兼容)
kwargs.pop("token", None)
# 应用随机延迟
if delay:
from .stealth import random_delay
random_delay(delay[0], delay[1])
# 如果指定了 cf_proxies自动检测 Workers 类型
if cf_proxies:
workers_mode = None
# 支持 WorkersManager 对象
if hasattr(cf_proxies, 'url'):
workers_mode = getattr(cf_proxies, 'mode', None)
if not uuid and hasattr(cf_proxies, 'uuid'):
manager_uuid = getattr(cf_proxies, 'uuid', None)
if manager_uuid:
uuid = manager_uuid
manager_url = getattr(cf_proxies, 'url', None)
if not manager_url:
raise ValueError("WorkersManager 未成功创建 Workers请检查 API Token 和 Account ID")
cf_proxies = manager_url
# 检测是否为爬楼梯 WorkersHTTP 代理模式)
workers_type = workers_mode if workers_mode in ('http', 'vless') else _detect_workers_type(cf_proxies)
if workers_type == 'http':
# 使用爬楼梯 Workers HTTP 代理
return _request_http_proxy(
method, url, cf_proxies,
http2=http2, impersonate=impersonate,
map_output=map_output, map_file=map_file,
stealth=stealth, stealth_browser=stealth_browser,
**kwargs
)
else:
# 使用 VLESS Workers 代理
return _request_vless(
method, url, cf_proxies, uuid,
http2=http2, impersonate=impersonate,
map_output=map_output, map_file=map_file,
stealth=stealth, stealth_browser=stealth_browser,
static_ip=static_ip, two_proxy=two_proxy,
**kwargs
)
# 没有指定代理,直接请求
params = kwargs.pop("params", None)
headers = kwargs.pop("headers", {})
# 如果启用隐身模式,添加完整的浏览器请求头
if stealth:
from .stealth import get_stealth_headers
stealth_headers = get_stealth_headers(stealth_browser)
final_headers = stealth_headers.copy()
final_headers.update(headers)
headers = final_headers
data = kwargs.pop("data", None)
json_data = kwargs.pop("json", None)
cookies = kwargs.pop("cookies", None)
timeout = kwargs.pop("timeout", 30)
# 记录请求开始时间
start_time = time.time()
# 如果指定了 impersonate使用 curl_cffi
if impersonate:
curl_requests = _get_curl_cffi()
response = curl_requests.request(
method,
url,
params=params,
headers=headers,
data=data,
json=json_data,
cookies=cookies,
timeout=timeout,
impersonate=impersonate,
**kwargs
)
resp = CFSpiderResponse(response)
_handle_map_output(resp, url, start_time, map_output, map_file)
return resp
# 如果启用 HTTP/2使用 httpx
if http2:
httpx = _get_httpx()
with httpx.Client(http2=True, timeout=timeout) as client:
response = client.request(
method,
url,
params=params,
headers=headers,
data=data,
json=json_data,
cookies=cookies,
**kwargs
)
resp = CFSpiderResponse(response)
_handle_map_output(resp, url, start_time, map_output, map_file)
return resp
# 默认使用 requests
resp = requests.request(
method,
url,
params=params,
headers=headers,
data=data,
json=json_data,
cookies=cookies,
timeout=timeout,
**kwargs
)
response = CFSpiderResponse(resp)
_handle_map_output(response, url, start_time, map_output, map_file)
return response
def _handle_map_output(response, url, start_time, map_output, map_file):
"""处理 IP 地图输出"""
if not map_output:
return
# 计算响应时间
response_time = (time.time() - start_time) * 1000 # 毫秒
# 收集 IP 记录
ip_map.add_ip_record(
url=url,
ip=None,
cf_colo=getattr(response, 'cf_colo', None),
cf_ray=getattr(response, 'cf_ray', None),
status_code=response.status_code,
response_time=response_time
)
# 生成地图 HTML
ip_map.generate_map_html(output_file=map_file)
def _detect_workers_type(cf_proxies):
"""
自动检测 Workers 类型
通过访问 /health 端点来判断是爬楼梯 WorkersHTTP代理还是 VLESS Workers
Returns:
'http': 爬楼梯 WorkersHTTP 代理模式)
'vless': VLESS Workers
"""
# 支持 WorkersManager 对象
if hasattr(cf_proxies, 'url'):
cf_proxies = getattr(cf_proxies, 'url', None)
# 兜底转换为字符串
if not isinstance(cf_proxies, str):
cf_proxies = str(cf_proxies)
if not cf_proxies:
return 'vless'
# 解析地址
if not cf_proxies.startswith('http'):
cf_proxies = f'https://{cf_proxies}'
cf_proxies = cf_proxies.rstrip('/')
try:
# 尝试访问 /health 端点(爬楼梯 Workers 特有)
response = requests.get(f'{cf_proxies}/health', timeout=5)
if response.status_code == 200:
data = response.json()
if 'status' in data and data.get('status') == 'ok':
return 'http'
except:
pass
try:
# 尝试访问 /proxy 端点(爬楼梯 Workers 特有)
response = requests.get(f'{cf_proxies}/proxy', timeout=5)
if response.status_code == 400: # Missing url parameter
data = response.json()
if 'error' in data and 'url' in data.get('error', '').lower():
return 'http'
except:
pass
# 默认使用 VLESS
return 'vless'
def _request_http_proxy(method, url, http_proxy,
http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome', **kwargs):
"""
使用爬楼梯 Workers HTTP 代理发送请求
这是一个简单的 HTTP 代理模式,不使用 VLESS 协议,
适合不需要隐藏 Cloudflare 特征的普通爬虫场景。
Args:
method: HTTP 方法
url: 目标 URL
http_proxy: 爬楼梯 Workers 地址
其他参数与 request() 相同
"""
start_time = time.time()
# 支持 WorkersManager 对象
if hasattr(http_proxy, 'url'):
http_proxy = getattr(http_proxy, 'url', None)
# 兜底转换为字符串
if not isinstance(http_proxy, str):
http_proxy = str(http_proxy)
if not http_proxy:
raise ValueError("cf_proxies 不能为空")
# 解析代理地址
if not http_proxy.startswith('http'):
http_proxy = f'https://{http_proxy}'
# 移除末尾斜杠
http_proxy = http_proxy.rstrip('/')
# 构建代理请求
proxy_url = f'{http_proxy}/proxy'
# 准备请求头
headers = kwargs.pop('headers', {})
# 如果启用隐身模式,添加完整的浏览器请求头
if stealth:
from .stealth import get_stealth_headers
stealth_headers = get_stealth_headers(stealth_browser)
final_headers = stealth_headers.copy()
final_headers.update(headers)
headers = final_headers
# 准备请求数据
data = kwargs.pop('data', None)
json_data = kwargs.pop('json', None)
params = kwargs.pop('params', None)
cookies = kwargs.pop('cookies', None)
timeout = kwargs.pop('timeout', 30)
token = kwargs.pop('token', None)
# 构建代理请求体
proxy_body = {
'url': url,
'method': method.upper(),
'headers': headers,
}
# 添加请求体
if data:
proxy_body['body'] = data
elif json_data:
import json
proxy_body['body'] = json.dumps(json_data)
if 'Content-Type' not in headers:
proxy_body['headers']['Content-Type'] = 'application/json'
# 添加查询参数到 URL
if params:
from urllib.parse import urlencode, urlparse, urlunparse, parse_qs
parsed = urlparse(url)
existing_params = parse_qs(parsed.query)
existing_params.update(params if isinstance(params, dict) else dict(params))
new_query = urlencode(existing_params, doseq=True)
proxy_body['url'] = urlunparse(parsed._replace(query=new_query))
# 添加 Cookie
if cookies:
cookie_str = '; '.join([f'{k}={v}' for k, v in cookies.items()])
proxy_body['headers']['Cookie'] = cookie_str
# 准备代理请求头
proxy_headers = {'Content-Type': 'application/json'}
if token:
proxy_headers['Authorization'] = f'Bearer {token}'
# 发送代理请求
import json
response = requests.post(
proxy_url,
json=proxy_body,
headers=proxy_headers,
timeout=timeout,
**kwargs
)
# 包装响应
resp = CFSpiderResponse(response)
_handle_map_output(resp, url, start_time, map_output, map_file)
return resp
# VLESS 本地代理缓存
_vless_proxy_cache = {}
# Workers 配置缓存
_workers_config_cache = {}
def _get_workers_config(cf_proxies):
"""
从 Workers 获取配置UUID、new_ip 等)
返回:
dict: {
'uuid': str,
'host': str,
'new_ip': bool,
...
}
"""
# 解析 Workers 地址
parsed = urlparse(cf_proxies)
if parsed.scheme:
host = parsed.netloc or parsed.path.split('/')[0]
else:
host = cf_proxies.split('/')[0]
# 检查缓存
if host in _workers_config_cache:
return _workers_config_cache[host]
# 尝试从 Workers API 获取配置
try:
workers_url = f"https://{host}"
resp = requests.get(f"{workers_url}/api/config", timeout=10)
if resp.status_code == 200:
config = resp.json()
config['host'] = host
_workers_config_cache[host] = config
return config
except:
pass
# 尝试从 /api/uuid 获取(兼容旧版本)
try:
workers_url = f"https://{host}"
resp = requests.get(f"{workers_url}/api/uuid", timeout=10)
if resp.status_code == 200:
config = resp.json()
config['host'] = host
if 'new_ip' not in config:
config['new_ip'] = True # 默认启用
_workers_config_cache[host] = config
return config
except:
pass
# 如果获取失败,尝试从首页 HTML 解析 UUID
try:
workers_url = f"https://{host}"
resp = requests.get(workers_url, timeout=10)
if resp.status_code == 200:
import re
match = re.search(r'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-4[0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12})', resp.text)
if match:
config = {
'uuid': match.group(1).lower(),
'host': host,
'new_ip': True # 默认启用
}
_workers_config_cache[host] = config
return config
except:
pass
return None
def _request_vless(method, url, cf_proxies, uuid=None,
http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome',
static_ip=False, two_proxy=None, **kwargs):
"""
使用 VLESS 协议发送请求
通过 CFspider Workers 的 VLESS 协议代理请求,
完全隐藏 Cloudflare 特征CF-Ray、CF-Worker 等头)。
Args:
method: HTTP 方法
url: 目标 URL
cf_proxies: Workers 地址,支持字符串或 WorkersManager 对象
uuid: VLESS UUID可选不填则自动获取
static_ip: 是否使用固定 IP默认 False
- False: 每次请求获取新的出口 IP适合大规模采集
- True: 保持使用同一个 IP适合需要会话一致性的场景
two_proxy: 第二层代理(可选)
格式: "host:port:user:pass"
流程: 本地 → Workers (VLESS) → 第二层代理 → 目标网站
其他参数与 request() 相同
"""
from .vless_client import LocalVlessProxy
import uuid as uuid_mod
# 支持 WorkersManager 对象
workers_manager = None
if hasattr(cf_proxies, 'url') and hasattr(cf_proxies, 'uuid'):
# cf_proxies 是 WorkersManager 对象
workers_manager = cf_proxies
if not uuid and workers_manager.uuid:
uuid = workers_manager.uuid
cf_proxies = workers_manager.url
if not cf_proxies:
raise ValueError("WorkersManager 未成功创建 Workers请检查 API Token 和 Account ID")
# 解析 Workers 地址获取 host
parsed = urlparse(cf_proxies)
if parsed.scheme:
host = parsed.netloc or parsed.path.split('/')[0]
else:
host = cf_proxies.split('/')[0]
# 尝试从 Workers 获取配置
config = _get_workers_config(cf_proxies)
# 如果用户没提供 uuid使用 Workers 配置的 uuid
if not uuid:
if not config:
raise ValueError(
f"无法从 {cf_proxies} 获取配置。\n"
"请确保 Workers 已正确部署,或手动指定 uuid 参数。"
)
uuid = config.get('uuid')
host = config.get('host', host)
if not uuid:
# Workers 配置了自定义 UUID需要用户手动填写
raise ValueError(
f"Workers 配置了自定义 UUID需要手动指定 uuid 参数。\n"
"用法: cfspider.get(url, cf_proxies='...', uuid='你的UUID')\n"
"提示: UUID 可在 Workers 界面或 Cloudflare Dashboard 中查看。"
)
# 自动获取 Workers 配置的 two_proxy
# 优先级:用户传入的 two_proxy > Workers 配置的 two_proxy
if two_proxy is None and config:
# 检查 Workers 是否配置了双层代理
if config.get('two_proxy_enabled'):
# 优先使用完整的 two_proxy 配置
workers_two_proxy = config.get('two_proxy')
if workers_two_proxy:
two_proxy = workers_two_proxy
else:
# 尝试从 vless_path 中解析
vless_path = config.get('vless_path', '')
if 'two_proxy=' in vless_path:
import re
match = re.search(r'two_proxy=([^&]+)', vless_path)
if match:
from urllib.parse import unquote
two_proxy = unquote(match.group(1))
# 构建 VLESS WebSocket URL: wss://host/uuid
vless_url = f"wss://{host}/{uuid}"
# 根据 static_ip 参数决定是否复用连接
if static_ip:
# 固定 IP 模式:复用同一个连接
cache_key = f"{host}:{uuid}:{two_proxy or ''}"
if cache_key in _vless_proxy_cache:
proxy = _vless_proxy_cache[cache_key]
port = proxy.port
else:
proxy = LocalVlessProxy(vless_url, uuid, two_proxy=two_proxy)
port = proxy.start()
_vless_proxy_cache[cache_key] = proxy
else:
# 动态 IP 模式:每次创建新连接
proxy = LocalVlessProxy(vless_url, uuid, two_proxy=two_proxy)
port = proxy.start()
# 构建本地代理 URL
local_proxy = f"http://127.0.0.1:{port}"
# 记录请求开始时间
start_time = time.time()
# 准备请求参数
params = kwargs.pop("params", None)
headers = kwargs.pop("headers", {})
# 如果启用隐身模式,添加完整的浏览器请求头
if stealth:
from .stealth import get_stealth_headers
stealth_headers = get_stealth_headers(stealth_browser)
final_headers = stealth_headers.copy()
final_headers.update(headers)
headers = final_headers
data = kwargs.pop("data", None)
json_data = kwargs.pop("json", None)
cookies = kwargs.pop("cookies", None)
timeout = kwargs.pop("timeout", 30)
proxies = {
"http": local_proxy,
"https": local_proxy
}
# 如果指定了 impersonate使用 curl_cffi
if impersonate:
curl_requests = _get_curl_cffi()
response = curl_requests.request(
method,
url,
params=params,
headers=headers,
data=data,
json=json_data,
cookies=cookies,
timeout=timeout,
impersonate=impersonate,
proxies=proxies,
**kwargs
)
resp = CFSpiderResponse(response)
_handle_map_output(resp, url, start_time, map_output, map_file)
return resp
# 如果启用 HTTP/2使用 httpx
if http2:
httpx = _get_httpx()
with httpx.Client(http2=True, timeout=timeout, proxy=local_proxy) as client:
response = client.request(
method,
url,
params=params,
headers=headers,
data=data,
json=json_data,
cookies=cookies,
**kwargs
)
resp = CFSpiderResponse(response)
_handle_map_output(resp, url, start_time, map_output, map_file)
return resp
# 默认使用 requests
resp = requests.request(
method,
url,
params=params,
headers=headers,
data=data,
json=json_data,
cookies=cookies,
timeout=timeout,
proxies=proxies,
**kwargs
)
response = CFSpiderResponse(resp)
_handle_map_output(response, url, start_time, map_output, map_file)
return response
def stop_vless_proxies():
"""
停止所有 VLESS 本地代理
在程序结束时调用,释放资源。
Example:
>>> import cfspider
>>> # 使用 VLESS 发送请求
>>> response = cfspider.get("https://httpbin.org/ip",
... cf_proxies="https://cfspider.violetqqcom.workers.dev",
... uuid="c373c80c-58e4-4e64-8db5-40096905ec58"
... )
>>> # 程序结束时清理
>>> cfspider.stop_vless_proxies()
"""
for key, proxy in list(_vless_proxy_cache.items()):
try:
proxy.stop()
except:
pass
_vless_proxy_cache.clear()
def get(url, cf_proxies=None, uuid=None, http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome', delay=None,
static_ip=False, two_proxy=None, **kwargs):
"""
发送 GET 请求 / Send GET request
Args:
url: 目标 URL必须包含协议如 https://
cf_proxies: CFspider Workers 地址(可选)
自动检测 Workers 类型:
- 爬楼梯 Workers: HTTP 代理模式,无敏感特征,适合爬虫
- VLESS Workers: 隐藏 Cloudflare 特征,适合代理软件
uuid: VLESS UUID可选仅 VLESS 模式)
不填写会自动从 Workers 获取
static_ip: 是否使用固定 IP默认 False
- False: 每次请求获取新的出口 IP适合大规模采集
- True: 保持使用同一个 IP适合需要会话一致性的场景
two_proxy: 第二层代理(可选,仅 VLESS 模式)
格式: "host:port:user:pass""host:port"
流程: 本地 → Workers (VLESS) → 第二层代理 → 目标网站
http2: 是否启用 HTTP/2 协议(默认 False
impersonate: TLS 指纹模拟(可选)
可选值: chrome131, safari18_0, firefox133, edge101 等
map_output: 是否生成 IP 地图 HTML 文件(默认 False
map_file: 地图输出文件名(默认 "cfspider_map.html"
stealth: 是否启用隐身模式(默认 False
stealth_browser: 隐身模式浏览器类型(默认 'chrome'
delay: 请求前随机延迟范围(秒),如 (1, 3)
**kwargs: 其他参数,与 requests 库完全兼容
Returns:
CFSpiderResponse: 响应对象
Example:
>>> # 使用爬楼梯 Workers自动检测HTTP 代理模式)
>>> response = cfspider.get(
... "https://httpbin.org/ip",
... cf_proxies="https://my-proxy.workers.dev"
... )
>>>
>>> # 使用 VLESS Workers自动检测
>>> response = cfspider.get(
... "https://httpbin.org/ip",
... cf_proxies="https://cfspider.workers.dev"
... )
"""
return request("GET", url, cf_proxies=cf_proxies, uuid=uuid,
http2=http2, impersonate=impersonate,
map_output=map_output, map_file=map_file,
stealth=stealth, stealth_browser=stealth_browser, delay=delay,
static_ip=static_ip, two_proxy=two_proxy, **kwargs)
def post(url, cf_proxies=None, uuid=None, http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome', delay=None,
static_ip=False, two_proxy=None, **kwargs):
"""发送 POST 请求 / Send POST request"""
return request("POST", url, cf_proxies=cf_proxies, uuid=uuid,
http2=http2, impersonate=impersonate,
map_output=map_output, map_file=map_file,
stealth=stealth, stealth_browser=stealth_browser, delay=delay,
static_ip=static_ip, two_proxy=two_proxy, **kwargs)
def put(url, cf_proxies=None, uuid=None, http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome', delay=None,
static_ip=False, two_proxy=None, **kwargs):
"""发送 PUT 请求 / Send PUT request"""
return request("PUT", url, cf_proxies=cf_proxies, uuid=uuid,
http2=http2, impersonate=impersonate,
map_output=map_output, map_file=map_file,
stealth=stealth, stealth_browser=stealth_browser, delay=delay,
static_ip=static_ip, two_proxy=two_proxy, **kwargs)
def delete(url, cf_proxies=None, uuid=None, http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome', delay=None,
static_ip=False, two_proxy=None, **kwargs):
"""发送 DELETE 请求 / Send DELETE request"""
return request("DELETE", url, cf_proxies=cf_proxies, uuid=uuid,
http2=http2, impersonate=impersonate,
map_output=map_output, map_file=map_file,
stealth=stealth, stealth_browser=stealth_browser, delay=delay,
static_ip=static_ip, two_proxy=two_proxy, **kwargs)
def head(url, cf_proxies=None, uuid=None, http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome', delay=None,
static_ip=False, two_proxy=None, **kwargs):
"""发送 HEAD 请求 / Send HEAD request"""
return request("HEAD", url, cf_proxies=cf_proxies, uuid=uuid,
http2=http2, impersonate=impersonate,
map_output=map_output, map_file=map_file,
stealth=stealth, stealth_browser=stealth_browser, delay=delay,
static_ip=static_ip, two_proxy=two_proxy, **kwargs)
def options(url, cf_proxies=None, uuid=None, http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome', delay=None,
static_ip=False, two_proxy=None, **kwargs):
"""发送 OPTIONS 请求 / Send OPTIONS request"""
return request("OPTIONS", url, cf_proxies=cf_proxies, uuid=uuid,
http2=http2, impersonate=impersonate,
map_output=map_output, map_file=map_file,
stealth=stealth, stealth_browser=stealth_browser, delay=delay,
static_ip=static_ip, two_proxy=two_proxy, **kwargs)
def patch(url, cf_proxies=None, uuid=None, http2=False, impersonate=None,
map_output=False, map_file="cfspider_map.html",
stealth=False, stealth_browser='chrome', delay=None,
static_ip=False, two_proxy=None, **kwargs):
"""发送 PATCH 请求 / Send PATCH request"""
return request("PATCH", url, cf_proxies=cf_proxies, uuid=uuid,
http2=http2, impersonate=impersonate,
map_output=map_output, map_file=map_file,
stealth=stealth, stealth_browser=stealth_browser, delay=delay,
static_ip=static_ip, two_proxy=two_proxy, **kwargs)
def clear_map_records():
"""清空 IP 地图记录"""
ip_map.clear_records()
def get_map_collector():
"""获取 IP 地图收集器"""
return ip_map.get_collector()