mirror of
https://github.com/titanwings/colleague-skill.git
synced 2026-04-04 22:59:06 +08:00
功能: - 通过飞书/钉钉自动采集同事的消息、文档、多维表格 - 支持 PDF、邮件、截图等手动上传 - 分析生成 Work Skill(工作能力)和 Persona(人物性格)两部分 - 支持对话纠正和版本管理 - 兼容 OpenClaw 和 Claude Code 双平台 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
375 lines
12 KiB
Python
375 lines
12 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
飞书浏览器抓取器(Playwright 方案)
|
||
|
||
复用本机 Chrome 登录态,无需任何 token,能访问你有权限的所有飞书内容。
|
||
|
||
支持:
|
||
- 飞书文档(docx/docs)
|
||
- 飞书知识库(wiki)
|
||
- 飞书表格(sheets)→ 导出为 CSV
|
||
- 飞书消息记录(指定群聊)
|
||
|
||
安装:
|
||
pip install playwright
|
||
playwright install chromium
|
||
|
||
用法:
|
||
python3 feishu_browser.py --url "https://xxx.feishu.cn/wiki/xxx" --output out.txt
|
||
python3 feishu_browser.py --url "https://xxx.feishu.cn/docx/xxx" --output out.txt
|
||
python3 feishu_browser.py --chat "后端组" --target "张三" --limit 500 --output out.txt
|
||
python3 feishu_browser.py --url "https://xxx.feishu.cn/sheets/xxx" --output out.csv
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import sys
|
||
import time
|
||
import json
|
||
import argparse
|
||
import platform
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
|
||
|
||
def get_default_chrome_profile() -> str:
|
||
"""根据操作系统返回 Chrome 默认 Profile 路径"""
|
||
system = platform.system()
|
||
if system == "Darwin":
|
||
return str(Path.home() / "Library/Application Support/Google/Chrome/Default")
|
||
elif system == "Linux":
|
||
return str(Path.home() / ".config/google-chrome/Default")
|
||
elif system == "Windows":
|
||
import os
|
||
return str(Path(os.environ.get("LOCALAPPDATA", "")) / "Google/Chrome/User Data/Default")
|
||
return str(Path.home() / ".config/google-chrome/Default")
|
||
|
||
|
||
def make_context(playwright, chrome_profile: Optional[str], headless: bool):
|
||
"""创建复用登录态的浏览器上下文"""
|
||
profile = chrome_profile or get_default_chrome_profile()
|
||
try:
|
||
ctx = playwright.chromium.launch_persistent_context(
|
||
user_data_dir=profile,
|
||
headless=headless,
|
||
args=[
|
||
"--disable-blink-features=AutomationControlled",
|
||
"--no-first-run",
|
||
"--no-default-browser-check",
|
||
],
|
||
ignore_default_args=["--enable-automation"],
|
||
viewport={"width": 1280, "height": 900},
|
||
)
|
||
return ctx
|
||
except Exception as e:
|
||
print(f"⚠️ 无法加载 Chrome Profile:{e}", file=sys.stderr)
|
||
print(f" 尝试的路径:{profile}", file=sys.stderr)
|
||
print(" 请用 --chrome-profile 手动指定路径", file=sys.stderr)
|
||
sys.exit(1)
|
||
|
||
|
||
def detect_page_type(url: str) -> str:
|
||
"""根据 URL 判断飞书页面类型"""
|
||
if "/wiki/" in url:
|
||
return "wiki"
|
||
elif "/docx/" in url or "/docs/" in url:
|
||
return "doc"
|
||
elif "/sheets/" in url or "/spreadsheets/" in url:
|
||
return "sheet"
|
||
elif "/base/" in url:
|
||
return "base"
|
||
else:
|
||
return "unknown"
|
||
|
||
|
||
def fetch_doc(page, url: str) -> str:
|
||
"""抓取飞书文档或 Wiki 的文本内容"""
|
||
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||
|
||
# 等待编辑器加载(飞书文档渲染较慢)
|
||
selectors = [
|
||
".docs-reader-content",
|
||
".lark-editor-content",
|
||
"[data-block-type]",
|
||
".doc-render-core",
|
||
".wiki-content",
|
||
".node-doc-content",
|
||
]
|
||
|
||
loaded = False
|
||
for sel in selectors:
|
||
try:
|
||
page.wait_for_selector(sel, timeout=15000)
|
||
loaded = True
|
||
break
|
||
except Exception:
|
||
continue
|
||
|
||
if not loaded:
|
||
# 等待一段时间后直接提取 body 文本
|
||
time.sleep(5)
|
||
|
||
# 额外等待异步内容渲染
|
||
time.sleep(2)
|
||
|
||
# 尝试多个选择器提取正文
|
||
for sel in selectors:
|
||
try:
|
||
el = page.query_selector(sel)
|
||
if el:
|
||
text = el.inner_text()
|
||
if len(text.strip()) > 50:
|
||
return text.strip()
|
||
except Exception:
|
||
continue
|
||
|
||
# fallback:提取整个 body
|
||
text = page.inner_text("body")
|
||
return text.strip()
|
||
|
||
|
||
def fetch_sheet(page, url: str) -> str:
|
||
"""抓取飞书表格,转为 CSV 格式"""
|
||
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||
|
||
try:
|
||
page.wait_for_selector(".spreadsheet-container, .sheet-container", timeout=15000)
|
||
except Exception:
|
||
time.sleep(5)
|
||
|
||
time.sleep(3)
|
||
|
||
# 通过 JS 提取表格数据
|
||
data = page.evaluate("""
|
||
() => {
|
||
const rows = [];
|
||
// 尝试从 DOM 提取可见单元格
|
||
const cells = document.querySelectorAll('[data-row][data-col]');
|
||
if (cells.length === 0) return null;
|
||
|
||
const grid = {};
|
||
let maxRow = 0, maxCol = 0;
|
||
cells.forEach(cell => {
|
||
const r = parseInt(cell.getAttribute('data-row'));
|
||
const c = parseInt(cell.getAttribute('data-col'));
|
||
if (!grid[r]) grid[r] = {};
|
||
grid[r][c] = cell.innerText.replace(/\\n/g, ' ').trim();
|
||
maxRow = Math.max(maxRow, r);
|
||
maxCol = Math.max(maxCol, c);
|
||
});
|
||
|
||
for (let r = 0; r <= maxRow; r++) {
|
||
const row = [];
|
||
for (let c = 0; c <= maxCol; c++) {
|
||
row.push(grid[r] && grid[r][c] ? grid[r][c] : '');
|
||
}
|
||
rows.push(row);
|
||
}
|
||
return rows;
|
||
}
|
||
""")
|
||
|
||
if data:
|
||
lines = []
|
||
for row in data:
|
||
lines.append(",".join(f'"{cell}"' for cell in row))
|
||
return "\n".join(lines)
|
||
|
||
# fallback:直接提取文本
|
||
return page.inner_text("body")
|
||
|
||
|
||
def fetch_messages(page, chat_name: str, target_name: str, limit: int = 500) -> str:
|
||
"""
|
||
抓取指定群聊中目标人物的消息记录。
|
||
需要先导航到飞书 Web 版消息页面。
|
||
"""
|
||
# 打开飞书消息页
|
||
page.goto("https://applink.feishu.cn/client/chat/open", wait_until="domcontentloaded", timeout=20000)
|
||
time.sleep(3)
|
||
|
||
# 尝试搜索群聊
|
||
try:
|
||
# 点击搜索
|
||
search_btn = page.query_selector('[data-test-id="search-btn"], .search-button, [placeholder*="搜索"]')
|
||
if search_btn:
|
||
search_btn.click()
|
||
time.sleep(1)
|
||
page.keyboard.type(chat_name)
|
||
time.sleep(2)
|
||
|
||
# 选择第一个结果
|
||
result = page.query_selector('.search-result-item:first-child, .im-search-item:first-child')
|
||
if result:
|
||
result.click()
|
||
time.sleep(2)
|
||
except Exception as e:
|
||
print(f"⚠️ 自动搜索群聊失败:{e}", file=sys.stderr)
|
||
print(f" 请手动导航到「{chat_name}」群聊,然后按回车继续...", file=sys.stderr)
|
||
input()
|
||
|
||
# 向上滚动加载历史消息
|
||
print(f"正在加载消息历史...", file=sys.stderr)
|
||
messages_container = page.query_selector('.message-list, .im-message-list, [data-testid="message-list"]')
|
||
|
||
if messages_container:
|
||
for _ in range(10): # 滚动 10 次
|
||
page.evaluate("el => el.scrollTop = 0", messages_container)
|
||
time.sleep(1.5)
|
||
else:
|
||
for _ in range(10):
|
||
page.keyboard.press("Control+Home")
|
||
time.sleep(1.5)
|
||
|
||
time.sleep(2)
|
||
|
||
# 提取消息
|
||
messages = page.evaluate(f"""
|
||
() => {{
|
||
const target = "{target_name}";
|
||
const results = [];
|
||
|
||
// 常见的消息 DOM 结构
|
||
const msgSelectors = [
|
||
'.message-item',
|
||
'.im-message-item',
|
||
'[data-message-id]',
|
||
'.msg-list-item',
|
||
];
|
||
|
||
let items = [];
|
||
for (const sel of msgSelectors) {{
|
||
items = document.querySelectorAll(sel);
|
||
if (items.length > 0) break;
|
||
}}
|
||
|
||
items.forEach(item => {{
|
||
const senderEl = item.querySelector(
|
||
'.sender-name, .message-sender, [data-testid="sender-name"], .name'
|
||
);
|
||
const contentEl = item.querySelector(
|
||
'.message-content, .msg-content, [data-testid="message-content"], .text-content'
|
||
);
|
||
const timeEl = item.querySelector(
|
||
'.message-time, .msg-time, [data-testid="message-time"], .time'
|
||
);
|
||
|
||
const sender = senderEl ? senderEl.innerText.trim() : '';
|
||
const content = contentEl ? contentEl.innerText.trim() : '';
|
||
const time = timeEl ? timeEl.innerText.trim() : '';
|
||
|
||
if (!content) return;
|
||
if (target && !sender.includes(target)) return;
|
||
|
||
results.push({{ sender, content, time }});
|
||
}});
|
||
|
||
return results.slice(-{limit});
|
||
}}
|
||
""")
|
||
|
||
if not messages:
|
||
print("⚠️ 未能自动提取消息,尝试提取页面文本", file=sys.stderr)
|
||
return page.inner_text("body")
|
||
|
||
# 按权重分类输出
|
||
long_msgs = [m for m in messages if len(m.get("content", "")) > 50]
|
||
short_msgs = [m for m in messages if len(m.get("content", "")) <= 50]
|
||
|
||
lines = [
|
||
f"# 飞书消息记录(浏览器抓取)",
|
||
f"群聊:{chat_name}",
|
||
f"目标人物:{target_name}",
|
||
f"共 {len(messages)} 条消息",
|
||
"",
|
||
"---",
|
||
"",
|
||
"## 长消息(观点/决策类)",
|
||
"",
|
||
]
|
||
for m in long_msgs:
|
||
lines.append(f"[{m.get('time', '')}] {m.get('content', '')}")
|
||
lines.append("")
|
||
|
||
lines += ["---", "", "## 日常消息", ""]
|
||
for m in short_msgs[:200]:
|
||
lines.append(f"[{m.get('time', '')}] {m.get('content', '')}")
|
||
|
||
return "\n".join(lines)
|
||
|
||
|
||
def main() -> None:
|
||
parser = argparse.ArgumentParser(description="飞书浏览器抓取器(复用 Chrome 登录态)")
|
||
parser.add_argument("--url", help="飞书文档/Wiki/表格链接")
|
||
parser.add_argument("--chat", help="群聊名称(抓取消息记录时使用)")
|
||
parser.add_argument("--target", help="目标人物姓名(只提取此人的消息)")
|
||
parser.add_argument("--limit", type=int, default=500, help="最多抓取消息条数(默认 500)")
|
||
parser.add_argument("--output", default=None, help="输出文件路径(默认打印到 stdout)")
|
||
parser.add_argument("--chrome-profile", default=None, help="Chrome Profile 路径(默认自动检测)")
|
||
parser.add_argument("--headless", action="store_true", help="无头模式(不显示浏览器窗口)")
|
||
parser.add_argument("--show-browser", action="store_true", help="显示浏览器窗口(调试用)")
|
||
|
||
args = parser.parse_args()
|
||
|
||
if not args.url and not args.chat:
|
||
parser.error("请提供 --url(文档链接)或 --chat(群聊名称)")
|
||
|
||
try:
|
||
from playwright.sync_api import sync_playwright
|
||
except ImportError:
|
||
print("错误:请先安装 Playwright:pip install playwright && playwright install chromium", file=sys.stderr)
|
||
sys.exit(1)
|
||
|
||
headless = args.headless and not args.show_browser
|
||
|
||
print(f"启动浏览器({'无头' if headless else '有界面'}模式)...", file=sys.stderr)
|
||
|
||
with sync_playwright() as p:
|
||
ctx = make_context(p, args.chrome_profile, headless=headless)
|
||
page = ctx.new_page()
|
||
|
||
# 检查是否已登录
|
||
page.goto("https://www.feishu.cn", wait_until="domcontentloaded", timeout=15000)
|
||
time.sleep(2)
|
||
if "login" in page.url.lower() or "signin" in page.url.lower():
|
||
print("⚠️ 检测到未登录状态。", file=sys.stderr)
|
||
print(" 请在打开的浏览器窗口中登录飞书,登录后按回车继续...", file=sys.stderr)
|
||
if headless:
|
||
print(" 提示:请用 --show-browser 参数显示浏览器窗口以完成登录", file=sys.stderr)
|
||
sys.exit(1)
|
||
input()
|
||
|
||
# 根据任务类型执行
|
||
if args.url:
|
||
page_type = detect_page_type(args.url)
|
||
print(f"页面类型:{page_type},开始抓取...", file=sys.stderr)
|
||
|
||
if page_type == "sheet":
|
||
content = fetch_sheet(page, args.url)
|
||
else:
|
||
content = fetch_doc(page, args.url)
|
||
|
||
elif args.chat:
|
||
content = fetch_messages(
|
||
page,
|
||
chat_name=args.chat,
|
||
target_name=args.target or "",
|
||
limit=args.limit,
|
||
)
|
||
|
||
ctx.close()
|
||
|
||
if not content or len(content.strip()) < 10:
|
||
print("⚠️ 未能提取到有效内容", file=sys.stderr)
|
||
sys.exit(1)
|
||
|
||
if args.output:
|
||
Path(args.output).write_text(content, encoding="utf-8")
|
||
print(f"✅ 已保存到 {args.output}({len(content)} 字符)", file=sys.stderr)
|
||
else:
|
||
print(content)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|