feat: add unified data processing module (cfspider.read, DataFrame)

- Add cfspider.read() for unified data reading (file, URL, list)
- Add DataFrame class with chainable operations
- Support filter, transform, sort, select, drop, add, unique, dropna, fillna
- Support group().agg() for aggregation
- Support join, concat for merging
- Support save to CSV, JSON, Excel, Parquet
- Support conversion to/from pandas and polars
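A minimal usage sketch of the API this commit adds (file names and column names below are illustrative, not part of the diff):

import cfspider

# Read rows from a CSV file into the new lightweight DataFrame
df = cfspider.read("products.csv")

# Chain operations: drop rows with null prices, coerce types, filter, sort, project
cleaned = (
    df.dropna("price")
      .transform(price=float)
      .filter(price=lambda x: x > 100)
      .sort("price", desc=True)
      .select("name", "price")
)

# The file extension picks the writer (CSV, JSON, Excel, Parquet)
cleaned.save("expensive.csv")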
Author: violettools
Date: 2026-01-26 22:21:54 +08:00
Commit: d9efb5b893 (parent: 70f890bbe6)
4 changed files with 891 additions and 0 deletions


@@ -93,6 +93,9 @@ from .batch import batch, abatch, BatchResult, BatchItem
# Data export
from .export import export
# Data processing module
from .data import DataFrame, read, read_csv, read_json, read_excel
# Local proxy server (supports two-layer proxying)
from .proxy_server import (
generate_vless_link,
@@ -281,4 +284,6 @@ __all__ = [
"generate_vless_link", "start_proxy_server", "TwoProxyServer",
# Workers manager
"make_workers", "list_workers", "delete_workers", "WorkersManager",
# Data processing
"DataFrame", "read", "read_csv", "read_json", "read_excel",
]

cfspider/data/__init__.py (new file, 16 lines added)

@@ -0,0 +1,16 @@
"""
CFspider Data Processing Module
Unified data processing module; can be used standalone or integrated with the crawler
"""
from .dataframe import DataFrame
from .io import read, read_csv, read_json, read_excel
__all__ = [
'DataFrame',
'read',
'read_csv',
'read_json',
'read_excel',
]

cfspider/data/dataframe.py (new file, 546 lines added)

@@ -0,0 +1,546 @@
"""
DataFrame - a lightweight data processing class
"""
import json
import csv
from typing import List, Dict, Any, Callable, Union, Optional, Tuple
from copy import deepcopy
class DataFrame:
"""
Lightweight DataFrame implementation.
Supports chainable operations and integrates seamlessly with the crawler.
"""
def __init__(self, data: List[Dict[str, Any]] = None):
"""
Create a DataFrame.
Args:
data: a list of dicts, e.g. [{"name": "A", "price": 100}, ...]
"""
self._data = data if data is not None else []
self._columns = self._infer_columns()
def _infer_columns(self) -> List[str]:
"""推断列名"""
if not self._data:
return []
cols = set()
for row in self._data:
cols.update(row.keys())
return list(cols)
@property
def columns(self) -> List[str]:
"""列名列表"""
return self._columns.copy()
@property
def rows(self) -> int:
"""行数"""
return len(self._data)
@property
def shape(self) -> Tuple[int, int]:
"""(行数, 列数)"""
return (len(self._data), len(self._columns))
def __len__(self) -> int:
return len(self._data)
def __iter__(self):
return iter(self._data)
def __getitem__(self, key):
if isinstance(key, str):
# Get a column
return [row.get(key) for row in self._data]
elif isinstance(key, int):
# Get a row
return self._data[key]
elif isinstance(key, slice):
# Slice rows
return DataFrame(self._data[key])
elif isinstance(key, list):
# Select multiple columns
return self.select(*key)
raise TypeError(f"不支持的索引类型: {type(key)}")
def __repr__(self) -> str:
return self._format_table()
def _format_table(self, max_rows: int = 20) -> str:
"""格式化为表格字符串"""
if not self._data:
return "DataFrame(空)"
# 计算列宽
cols = self._columns[:10] # 最多显示10列
widths = {}
for col in cols:
values = [str(row.get(col, ""))[:30] for row in self._data[:max_rows]]
widths[col] = max(len(col), max(len(v) for v in values) if values else 0)
# 构建表格
lines = []
# 顶部边框
top = "" + "".join("" * (widths[c] + 2) for c in cols) + ""
lines.append(top)
# 表头
header = "" + "".join(f" {c:^{widths[c]}} " for c in cols) + ""
lines.append(header)
# 分隔线
sep = "" + "".join("" * (widths[c] + 2) for c in cols) + ""
lines.append(sep)
# 数据行
display_data = self._data[:max_rows]
for row in display_data:
values = [str(row.get(c, ""))[:30] for c in cols]
line = "" + "".join(f" {v:<{widths[cols[i]]}} " for i, v in enumerate(values)) + ""
lines.append(line)
# 省略提示
if len(self._data) > max_rows:
lines.append(f"│ ... 共 {len(self._data)} 行,显示前 {max_rows} 行 ...".ljust(sum(widths.values()) + len(cols) * 3) + "")
# 底部边框
bottom = "" + "".join("" * (widths[c] + 2) for c in cols) + ""
lines.append(bottom)
# 添加维度信息
lines.append(f"\n[{len(self._data)} 行 x {len(self._columns)} 列]")
return "\n".join(lines)
# ========== Filtering ==========
def filter(self, func: Callable = None, **conditions) -> 'DataFrame':
"""
Filter rows.
Args:
func: a filter function that takes a row dict and returns bool
**conditions: per-column conditions, e.g. price=lambda x: x > 100
Examples:
df.filter(lambda row: row["price"] > 100)
df.filter(price=lambda x: x > 100)
df.filter(stock=lambda x: x is not None)
"""
result = self._data.copy()
if func:
result = [row for row in result if func(row)]
for col, cond in conditions.items():
if callable(cond):
result = [row for row in result if cond(row.get(col))]
else:
result = [row for row in result if row.get(col) == cond]
return DataFrame(result)
# ========== Transformation ==========
def transform(self, **transforms) -> 'DataFrame':
"""
Transform column values.
Args:
**transforms: per-column transforms, e.g. price=float, name=str.strip
Examples:
df.transform(price=float)
df.transform(price=lambda x: x * 0.9)
df.transform(name=str.strip, price=lambda x: float(x.replace("$", "")))
"""
result = deepcopy(self._data)
for col, func in transforms.items():
for row in result:
if col in row and row[col] is not None:
try:
row[col] = func(row[col])
except Exception:
pass
return DataFrame(result)
# ========== Sorting ==========
def sort(self, by: Union[str, List[str]], desc: Union[bool, List[bool]] = False) -> 'DataFrame':
"""
Sort rows.
Args:
by: column name(s) to sort by
desc: sort in descending order
Examples:
df.sort("price")
df.sort("price", desc=True)
df.sort(["category", "price"], desc=[False, True])
"""
if isinstance(by, str):
by = [by]
desc = [desc] if isinstance(desc, bool) else desc
def sort_key(row):
values = []
for i, col in enumerate(by):
val = row.get(col)
# Handle None values
if val is None:
val = "" if isinstance(val, str) else float('inf')
values.append(val)
return values
reverse = desc[0] if isinstance(desc, list) else desc
result = sorted(self._data, key=sort_key, reverse=reverse)
return DataFrame(result)
# ========== Selecting / dropping columns ==========
def select(self, *columns) -> 'DataFrame':
"""
Select columns.
Examples:
df.select("name", "price")
"""
result = [{col: row.get(col) for col in columns} for row in self._data]
return DataFrame(result)
def drop(self, *columns) -> 'DataFrame':
"""
Drop columns.
Examples:
df.drop("temp_column")
"""
keep = [c for c in self._columns if c not in columns]
return self.select(*keep)
# ========== Adding columns ==========
def add(self, name: str, value: Union[Any, Callable]) -> 'DataFrame':
"""
Add a column.
Args:
name: column name
value: a constant value or a function of the row
Examples:
df.add("total", lambda row: row["price"] * row["quantity"])
df.add("source", "crawler")
"""
result = deepcopy(self._data)
for row in result:
if callable(value):
row[name] = value(row)
else:
row[name] = value
return DataFrame(result)
# ========== Deduplication ==========
def unique(self, *columns) -> 'DataFrame':
"""
Remove duplicate rows.
Examples:
df.unique()
df.unique("name")
df.unique("name", "category")
"""
if not columns:
columns = self._columns
seen = set()
result = []
for row in self._data:
key = tuple(row.get(c) for c in columns)
if key not in seen:
seen.add(key)
result.append(row)
return DataFrame(result)
# ========== Missing-value handling ==========
def dropna(self, *columns) -> 'DataFrame':
"""
Drop rows containing null values.
Examples:
df.dropna()
df.dropna("price")
"""
if not columns:
columns = self._columns
result = [
row for row in self._data
if all(row.get(c) is not None for c in columns)
]
return DataFrame(result)
def fillna(self, **values) -> 'DataFrame':
"""
Fill null values.
Examples:
df.fillna(price=0, stock="unknown")
"""
result = deepcopy(self._data)
for row in result:
for col, val in values.items():
if row.get(col) is None:
row[col] = val
return DataFrame(result)
# ========== Grouping / aggregation ==========
def group(self, *by) -> 'GroupedDataFrame':
"""
Group rows by one or more columns.
Examples:
df.group("category").agg(count=("name", "count"), avg=("price", "mean"))
"""
return GroupedDataFrame(self._data, list(by))
# ========== Merging ==========
def join(self, other: 'DataFrame', on: str, how: str = "inner") -> 'DataFrame':
"""
Join two DataFrames.
Args:
other: the other DataFrame
on: join key column
how: join type (inner, left, right, outer)
"""
result = []
other_dict = {row[on]: row for row in other._data if on in row}
for row in self._data:
key = row.get(on)
if key in other_dict:
merged = {**row, **other_dict[key]}
result.append(merged)
elif how in ("left", "outer"):
result.append(row.copy())
if how in ("right", "outer"):
self_keys = {row.get(on) for row in self._data}
for row in other._data:
if row.get(on) not in self_keys:
result.append(row.copy())
return DataFrame(result)
def concat(self, other: 'DataFrame') -> 'DataFrame':
"""
Concatenate two DataFrames (append rows).
"""
return DataFrame(self._data + other._data)
# ========== Slicing ==========
def head(self, n: int = 5) -> 'DataFrame':
"""取前 N 行"""
return DataFrame(self._data[:n])
def tail(self, n: int = 5) -> 'DataFrame':
"""取后 N 行"""
return DataFrame(self._data[-n:])
# ========== Saving ==========
def save(self, path: str, **options) -> None:
"""
Save to a file.
Args:
path: file path; the format is inferred from the extension
Examples:
df.save("output.csv")
df.save("output.json")
df.save("output.xlsx")
"""
ext = path.rsplit(".", 1)[-1].lower()
if ext == "csv":
self.to_csv(path, **options)
elif ext == "json":
self.to_json(path, **options)
elif ext in ("xlsx", "xls"):
self.to_excel(path, **options)
elif ext == "parquet":
self.to_parquet(path, **options)
else:
raise ValueError(f"不支持的文件格式: {ext}")
def to_csv(self, path: str, encoding: str = "utf-8") -> None:
"""保存为 CSV"""
with open(path, "w", newline="", encoding=encoding) as f:
if self._data:
writer = csv.DictWriter(f, fieldnames=self._columns)
writer.writeheader()
writer.writerows(self._data)
def to_json(self, path: str, indent: int = 2, encoding: str = "utf-8") -> None:
"""保存为 JSON"""
with open(path, "w", encoding=encoding) as f:
json.dump(self._data, f, indent=indent, ensure_ascii=False)
def to_excel(self, path: str, sheet_name: str = "Sheet1") -> None:
"""保存为 Excel需要 openpyxl"""
try:
from openpyxl import Workbook
except ImportError:
raise ImportError("保存 Excel 需要安装 openpyxl: pip install openpyxl")
wb = Workbook()
ws = wb.active
ws.title = sheet_name
# Write the header row
for col, name in enumerate(self._columns, 1):
ws.cell(row=1, column=col, value=name)
# Write the data rows
for row_idx, row in enumerate(self._data, 2):
for col_idx, col_name in enumerate(self._columns, 1):
ws.cell(row=row_idx, column=col_idx, value=row.get(col_name))
wb.save(path)
def to_parquet(self, path: str) -> None:
"""保存为 Parquet需要 pyarrow"""
try:
import pyarrow as pa
import pyarrow.parquet as pq
except ImportError:
raise ImportError("保存 Parquet 需要安装 pyarrow: pip install pyarrow")
table = pa.Table.from_pylist(self._data)
pq.write_table(table, path)
# ========== Conversion ==========
def to_list(self) -> List[Dict[str, Any]]:
"""转为字典列表"""
return deepcopy(self._data)
def to_dict(self) -> Dict[str, List[Any]]:
"""转为列字典"""
return {col: [row.get(col) for row in self._data] for col in self._columns}
def to_pandas(self):
"""转为 pandas DataFrame"""
try:
import pandas as pd
except ImportError:
raise ImportError("需要安装 pandas: pip install pandas")
return pd.DataFrame(self._data)
def to_polars(self):
"""转为 polars DataFrame"""
try:
import polars as pl
except ImportError:
raise ImportError("需要安装 polars: pip install polars")
return pl.DataFrame(self._data)
# ========== Class methods ==========
@classmethod
def from_pandas(cls, df) -> 'DataFrame':
"""从 pandas DataFrame 创建"""
return cls(df.to_dict(orient="records"))
@classmethod
def from_polars(cls, df) -> 'DataFrame':
"""从 polars DataFrame 创建"""
return cls(df.to_dicts())
class GroupedDataFrame:
"""分组后的 DataFrame"""
def __init__(self, data: List[Dict], by: List[str]):
self._data = data
self._by = by
self._groups = self._make_groups()
def _make_groups(self) -> Dict[tuple, List[Dict]]:
"""创建分组"""
groups = {}
for row in self._data:
key = tuple(row.get(col) for col in self._by)
if key not in groups:
groups[key] = []
groups[key].append(row)
return groups
def agg(self, **aggregations) -> DataFrame:
"""
Aggregate each group.
Args:
**aggregations: aggregation spec, e.g. count=("name", "count"), avg=("price", "mean")
Examples:
df.group("category").agg(
count=("name", "count"),
avg_price=("price", "mean"),
total=("stock", "sum")
)
"""
result = []
for key, rows in self._groups.items():
row_result = dict(zip(self._by, key))
for name, (col, func) in aggregations.items():
values = [r.get(col) for r in rows if r.get(col) is not None]
if func == "count":
row_result[name] = len(values)
elif func == "sum":
row_result[name] = sum(values) if values else 0
elif func == "mean":
row_result[name] = sum(values) / len(values) if values else None
elif func == "min":
row_result[name] = min(values) if values else None
elif func == "max":
row_result[name] = max(values) if values else None
elif func == "first":
row_result[name] = values[0] if values else None
elif func == "last":
row_result[name] = values[-1] if values else None
elif callable(func):
row_result[name] = func(values)
result.append(row_result)
return DataFrame(result)
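For reference, a short sketch of how the grouping and join pieces above compose; the rows and column names are made up for illustration:

from cfspider.data import DataFrame

orders = DataFrame([
    {"category": "book", "name": "A", "price": 30, "stock": 5},
    {"category": "book", "name": "B", "price": 50, "stock": 2},
    {"category": "toy", "name": "C", "price": 20, "stock": 9},
])

# agg maps each output column to a (source column, function) pair
summary = orders.group("category").agg(
    count=("name", "count"),
    avg_price=("price", "mean"),
    total_stock=("stock", "sum"),
)

# join matches rows on a key column; "left" keeps unmatched left-hand rows
labels = DataFrame([{"category": "book", "label": "media"}])
joined = summary.join(labels, on="category", how="left")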

cfspider/data/io.py (new file, 324 lines added)

@@ -0,0 +1,324 @@
"""
IO module - unified entry point for reading data
"""
import json
import csv
import os
from typing import List, Dict, Any, Union
from .dataframe import DataFrame
def read(source: Any, pick: Dict[str, str] = None, **options) -> DataFrame:
"""
Unified entry point for reading data.
Args:
source: the data source; one of:
- file path: "data.csv", "data.json", "data.xlsx"
- URL: "https://..." (crawled automatically)
- list of URLs: ["url1", "url2"] (batch crawl)
- list of dicts: [{"a": 1}, {"a": 2}]
- pandas DataFrame
- polars DataFrame
pick: extraction rules used when crawling
**options: additional options
Returns:
DataFrame
Examples:
# Read from a file
df = read("data.csv")
df = read("data.json")
# Crawl a URL
df = read("https://example.com", pick={"title": "h1", "price": ".price"})
# Batch crawl
df = read(["url1", "url2"], pick={"title": "h1"})
# Create from a list of dicts
df = read([{"name": "A", "price": 100}])
"""
# 1. List of dicts
if isinstance(source, list) and source and isinstance(source[0], dict):
return DataFrame(source)
# 2. Empty list
if isinstance(source, list) and not source:
return DataFrame([])
# 3. List of URLs (crawl)
if isinstance(source, list) and source and isinstance(source[0], str):
if source[0].startswith(("http://", "https://")):
return _read_urls(source, pick, **options)
else:
# Possibly a list of file paths
dfs = [read(path, **options) for path in source]
result = dfs[0]
for df in dfs[1:]:
result = result.concat(df)
return result
# 4. String
if isinstance(source, str):
# 4.1 URL
if source.startswith(("http://", "https://")):
return _read_url(source, pick, **options)
# 4.2 File path
if os.path.exists(source):
return _read_file(source, **options)
raise FileNotFoundError(f"文件不存在: {source}")
# 5. pandas DataFrame
if hasattr(source, "to_dict") and hasattr(source, "columns"):
return DataFrame.from_pandas(source)
# 6. polars DataFrame
if hasattr(source, "to_dicts"):
return DataFrame.from_polars(source)
raise TypeError(f"不支持的数据源类型: {type(source)}")
def _read_file(path: str, **options) -> DataFrame:
"""读取文件"""
ext = path.rsplit(".", 1)[-1].lower()
if ext == "csv":
return read_csv(path, **options)
elif ext == "json":
return read_json(path, **options)
elif ext in ("xlsx", "xls"):
return read_excel(path, **options)
elif ext == "parquet":
return read_parquet(path, **options)
else:
raise ValueError(f"不支持的文件格式: {ext}")
def read_csv(path: str, encoding: str = "utf-8", **options) -> DataFrame:
"""
Read a CSV file.
Args:
path: file path
encoding: file encoding
"""
data = []
with open(path, "r", encoding=encoding, newline="") as f:
reader = csv.DictReader(f)
for row in reader:
# Try to convert numeric values
parsed_row = {}
for k, v in row.items():
if v == "":
parsed_row[k] = None
else:
try:
# Try integer first
if "." not in v:
parsed_row[k] = int(v)
else:
parsed_row[k] = float(v)
except (ValueError, TypeError):
parsed_row[k] = v
data.append(parsed_row)
return DataFrame(data)
def read_json(path: str, encoding: str = "utf-8", **options) -> DataFrame:
"""
Read a JSON file.
Args:
path: file path
encoding: file encoding
"""
with open(path, "r", encoding=encoding) as f:
data = json.load(f)
# Make sure we end up with a list
if isinstance(data, dict):
# Possibly a {"data": [...]} wrapper
for key in ("data", "items", "results", "records"):
if key in data and isinstance(data[key], list):
data = data[key]
break
else:
data = [data]
return DataFrame(data)
def read_excel(path: str, sheet_name: str = None, **options) -> DataFrame:
"""
Read an Excel file.
Args:
path: file path
sheet_name: worksheet name
"""
try:
from openpyxl import load_workbook
except ImportError:
raise ImportError("读取 Excel 需要安装 openpyxl: pip install openpyxl")
wb = load_workbook(path, read_only=True)
ws = wb[sheet_name] if sheet_name else wb.active
rows = list(ws.iter_rows(values_only=True))
if not rows:
return DataFrame([])
headers = [str(h) if h else f"col_{i}" for i, h in enumerate(rows[0])]
data = []
for row in rows[1:]:
data.append(dict(zip(headers, row)))
return DataFrame(data)
def read_parquet(path: str, **options) -> DataFrame:
"""
Read a Parquet file.
Args:
path: file path
"""
try:
import pyarrow.parquet as pq
except ImportError:
raise ImportError("读取 Parquet 需要安装 pyarrow: pip install pyarrow")
table = pq.read_table(path)
data = table.to_pylist()
return DataFrame(data)
def _read_url(url: str, pick: Dict[str, str] = None, **options) -> DataFrame:
"""爬取单个 URL"""
try:
from .. import api as cfspider_api
except ImportError:
raise ImportError("爬取功能需要 cfspider 主模块")
# 获取代理配置
cf_proxies = options.pop("cf_proxies", None)
uuid = options.pop("uuid", None)
headers = options.pop("headers", None)
# Send the request
response = cfspider_api.get(
url,
cf_proxies=cf_proxies,
uuid=uuid,
headers=headers,
**options
)
# No extraction rules: return the raw content
if not pick:
try:
# Try to parse as JSON
data = response.json()
if isinstance(data, list):
return DataFrame(data)
return DataFrame([data])
except Exception:
return DataFrame([{"url": url, "content": response.text}])
# Apply the extraction rules
row = {"url": url}
for name, selector in pick.items():
try:
if hasattr(response, "find"):
row[name] = response.find(selector)
else:
# Basic CSS selector support
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")
elem = soup.select_one(selector)
row[name] = elem.get_text(strip=True) if elem else None
except Exception:
row[name] = None
return DataFrame([row])
def _read_urls(urls: List[str], pick: Dict[str, str] = None, **options) -> DataFrame:
"""批量爬取多个 URL"""
try:
from .. import api as cfspider_api
except ImportError:
raise ImportError("爬取功能需要 cfspider 主模块")
# 获取配置
cf_proxies = options.pop("cf_proxies", None)
uuid = options.pop("uuid", None)
concurrency = options.pop("concurrency", 5)
delay = options.pop("delay", 0)
progress = options.pop("progress", True)
results = []
# Simple sequential crawl (could later be optimized into concurrent requests)
for i, url in enumerate(urls):
if progress:
print(f"\r爬取进度: {i+1}/{len(urls)}", end="", flush=True)
try:
response = cfspider_api.get(
url,
cf_proxies=cf_proxies,
uuid=uuid,
**options
)
if not pick:
try:
data = response.json()
if isinstance(data, dict):
data["url"] = url
results.append(data)
else:
results.append({"url": url, "content": response.text})
except Exception:
results.append({"url": url, "content": response.text})
else:
row = {"url": url}
for name, selector in pick.items():
try:
if hasattr(response, "find"):
row[name] = response.find(selector)
else:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")
elem = soup.select_one(selector)
row[name] = elem.get_text(strip=True) if elem else None
except Exception:
row[name] = None
results.append(row)
except Exception as e:
results.append({"url": url, "error": str(e)})
# Delay between requests
if delay and i < len(urls) - 1:
import time
if isinstance(delay, (list, tuple)):
import random
time.sleep(random.uniform(delay[0], delay[1]))
else:
time.sleep(delay)
if progress:
print()  # newline after the progress indicator
return DataFrame(results)
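A hedged sketch of the crawling path of read() shown above; the URL, selectors, and option values are placeholders and assume the main cfspider module is available:

from cfspider.data import read

# Single page: pick maps output columns to CSS selectors
df = read(
    "https://example.com/products",           # placeholder URL
    pick={"title": "h1", "price": ".price"},  # placeholder selectors
)

# Batch crawl: a list of URLs is fetched sequentially with an optional delay
urls = ["https://example.com/p/1", "https://example.com/p/2"]
batch = read(urls, pick={"title": "h1"}, delay=1, progress=True)
print(batch.shape)  # (rows, columns)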