Files
colleague-skill/tools/feishu_browser.py
titanwings 6a0b31aa6c refactor: restructure to official AgentSkills/Claude Code skill format
- Flatten colleague-creator/ to repo root (repo = skill directory)
- Update SKILL.md frontmatter with official fields: name, description,
  argument-hint, version, user-invocable, allowed-tools
- Move PRD.md → docs/PRD.md
- Add .gitignore, requirements.txt, LICENSE
- Update README and INSTALL docs to reflect new structure and git clone install

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-30 13:37:54 +08:00

375 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
飞书浏览器抓取器(Playwright 方案)
复用本机 Chrome 登录态,无需任何 token,能访问你有权限的所有飞书内容。
支持:
- 飞书文档(docx/docs)
- 飞书知识库(wiki)
- 飞书表格(sheets)→ 导出为 CSV
- 飞书消息记录(指定群聊)
安装:
pip install playwright
playwright install chromium
用法:
python3 feishu_browser.py --url "https://xxx.feishu.cn/wiki/xxx" --output out.txt
python3 feishu_browser.py --url "https://xxx.feishu.cn/docx/xxx" --output out.txt
python3 feishu_browser.py --chat "后端组" --target "张三" --limit 500 --output out.txt
python3 feishu_browser.py --url "https://xxx.feishu.cn/sheets/xxx" --output out.csv
"""
from __future__ import annotations
import sys
import time
import json
import argparse
import platform
from pathlib import Path
from typing import Optional
def get_default_chrome_profile() -> str:
    """Return the default Chrome profile directory for the current OS.

    The location is chosen from ``platform.system()``: macOS and Windows
    have dedicated paths, while Linux and any unrecognised system share
    ``~/.config/google-chrome/Default``. The path is returned as a string
    and is not checked for existence.
    """
    os_name = platform.system()

    if os_name == "Windows":
        import os

        # An unset LOCALAPPDATA leaves the result relative to the current
        # working directory.
        local_app_data = Path(os.environ.get("LOCALAPPDATA", ""))
        return str(local_app_data / "Google/Chrome/User Data/Default")

    if os_name == "Darwin":
        relative = "Library/Application Support/Google/Chrome/Default"
    else:
        # Linux and the catch-all case use the same location.
        relative = ".config/google-chrome/Default"
    return str(Path.home() / relative)
def make_context(playwright, chrome_profile: Optional[str], headless: bool):
    """Launch a persistent Chromium context backed by an existing profile.

    Args:
        playwright: Object exposing ``chromium.launch_persistent_context``.
        chrome_profile: Directory passed as ``user_data_dir``. When falsy,
            the result of ``get_default_chrome_profile()`` is used instead.
        headless: Whether to launch without a visible browser window.

    Returns:
        The context returned by ``launch_persistent_context``.

    Raises:
        SystemExit: With exit code 1 when the launch fails for any reason.
            The error and the attempted path are printed to stderr first.
    """
    profile = chrome_profile or get_default_chrome_profile()
    # NOTE(review): Playwright documents ``user_data_dir`` as Chromium's
    # "User Data" directory, i.e. the parent of a profile directory such as
    # ``Default`` — confirm that the path used here really reuses the login.
    launch_options = {
        "user_data_dir": profile,
        "headless": headless,
        "args": [
            "--disable-blink-features=AutomationControlled",
            "--no-first-run",
            "--no-default-browser-check",
        ],
        "ignore_default_args": ["--enable-automation"],
        "viewport": {"width": 1280, "height": 900},
    }
    try:
        return playwright.chromium.launch_persistent_context(**launch_options)
    except Exception as e:
        # Top-level boundary for a CLI tool: report and exit rather than
        # surface a traceback. The label and the error text are separated so
        # the message stays readable.
        print(f"⚠️ 无法加载 Chrome Profile:{e}", file=sys.stderr)
        print(f" 尝试的路径:{profile}", file=sys.stderr)
        print(" 请用 --chrome-profile 手动指定路径", file=sys.stderr)
        sys.exit(1)
def detect_page_type(url: str) -> str:
    """Classify a Feishu URL by the path marker it contains.

    Markers are checked in a fixed order (wiki, doc, sheet, base) and the
    first one found anywhere in ``url`` decides the result. A URL that
    contains none of them is reported as ``"unknown"``.
    """
    rules = (
        ("wiki", ("/wiki/",)),
        ("doc", ("/docx/", "/docs/")),
        ("sheet", ("/sheets/", "/spreadsheets/")),
        ("base", ("/base/",)),
    )
    for page_type, markers in rules:
        if any(marker in url for marker in markers):
            return page_type
    return "unknown"
def fetch_doc(page, url: str) -> str:
    """Fetch the text content of a Feishu document or wiki page.

    Args:
        page: Browser page object used for navigation and extraction.
        url: Address of the document or wiki page.

    Returns:
        The stripped inner text of the first known content container that
        holds more than 50 characters, or the stripped text of ``<body>``
        when no container qualifies.
    """
    page.goto(url, wait_until="domcontentloaded", timeout=30000)

    # Candidate containers for the rendered document body, in the order in
    # which they are tried during extraction.
    selectors = [
        ".docs-reader-content",
        ".lark-editor-content",
        "[data-block-type]",
        ".doc-render-core",
        ".wiki-content",
        ".node-doc-content",
    ]

    # Wait for whichever container shows up first. Waiting once on the
    # combined selector list bounds the delay to a single timeout instead of
    # one full timeout per selector that never appears.
    try:
        page.wait_for_selector(", ".join(selectors), timeout=15000)
    except Exception:
        # Best effort: nothing known appeared in time, so give the page a
        # little longer before reading whatever is there.
        time.sleep(5)

    # Extra pause for content that renders asynchronously.
    time.sleep(2)

    for sel in selectors:
        try:
            el = page.query_selector(sel)
            if el:
                text = el.inner_text()
                # Skip near-empty matches such as headers or placeholders.
                if len(text.strip()) > 50:
                    return text.strip()
        except Exception:
            # A failing selector must not prevent trying the remaining ones.
            continue

    # Fallback: take the text of the whole page body.
    return page.inner_text("body").strip()
def fetch_sheet(page, url: str) -> str:
    """Fetch a Feishu spreadsheet and render it as CSV text.

    Cells are read from DOM elements that carry both ``data-row`` and
    ``data-col`` attributes, so only cells present in the DOM are captured.

    Args:
        page: Browser page object used for navigation and extraction.
        url: Address of the spreadsheet.

    Returns:
        CSV text with every field quoted and rows separated by ``"\\n"``
        (no trailing newline), or the inner text of ``<body>`` when no
        cells were found.
    """
    import csv
    import io

    page.goto(url, wait_until="domcontentloaded", timeout=30000)
    try:
        page.wait_for_selector(".spreadsheet-container, .sheet-container", timeout=15000)
    except Exception:
        # Best effort: the container did not appear in time, wait a bit more.
        time.sleep(5)
    time.sleep(3)

    # Collect the cells into a dense row-major grid inside the page; missing
    # cells become empty strings and newlines inside a cell become spaces.
    data = page.evaluate("""
        () => {
            const rows = [];
            // 尝试从 DOM 提取可见单元格
            const cells = document.querySelectorAll('[data-row][data-col]');
            if (cells.length === 0) return null;
            const grid = {};
            let maxRow = 0, maxCol = 0;
            cells.forEach(cell => {
                const r = parseInt(cell.getAttribute('data-row'));
                const c = parseInt(cell.getAttribute('data-col'));
                if (!grid[r]) grid[r] = {};
                grid[r][c] = cell.innerText.replace(/\\n/g, ' ').trim();
                maxRow = Math.max(maxRow, r);
                maxCol = Math.max(maxCol, c);
            });
            for (let r = 0; r <= maxRow; r++) {
                const row = [];
                for (let c = 0; c <= maxCol; c++) {
                    row.push(grid[r] && grid[r][c] ? grid[r][c] : '');
                }
                rows.push(row);
            }
            return rows;
        }
    """)

    if data:
        # Use the csv module so that double quotes inside a cell are escaped
        # (doubled) instead of producing malformed CSV.
        buffer = io.StringIO()
        writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerows(data)
        csv_text = buffer.getvalue()
        # Drop the terminator after the last row so rows are newline-joined.
        return csv_text[:-1] if csv_text.endswith("\n") else csv_text

    # Fallback: return the page text as-is.
    return page.inner_text("body")
def fetch_messages(page, chat_name: str, target_name: str, limit: int = 500) -> str:
    """Scrape one person's messages from a Feishu group chat.

    Opens the Feishu chat page, tries to find the chat through the search
    box, scrolls to the top a fixed number of times to load history, then
    extracts messages from the DOM and formats them as a text report.

    Args:
        page: Browser page object used for navigation and extraction.
        chat_name: Text typed into the search box to locate the chat.
        target_name: Only messages whose sender name contains this text are
            kept; an empty string keeps every sender.
        limit: Keep at most this many matching messages, taken from the end
            of the extracted list. Zero or a negative value keeps them all.

    Returns:
        The formatted report, or the inner text of ``<body>`` when no
        message could be extracted.
    """
    # Open the Feishu messaging page.
    page.goto("https://applink.feishu.cn/client/chat/open", wait_until="domcontentloaded", timeout=20000)
    time.sleep(3)

    # Try to locate the chat through the search UI. If the search control is
    # not found, nothing is typed and the current view is used as-is.
    try:
        search_btn = page.query_selector('[data-test-id="search-btn"], .search-button, [placeholder*="搜索"]')
        if search_btn:
            search_btn.click()
            time.sleep(1)
            page.keyboard.type(chat_name)
            time.sleep(2)
            # Pick the first search result.
            result = page.query_selector('.search-result-item:first-child, .im-search-item:first-child')
            if result:
                result.click()
                time.sleep(2)
    except Exception as e:
        # Hand control to the user and block until Enter is pressed.
        print(f"⚠️ 自动搜索群聊失败:{e}", file=sys.stderr)
        print(f" 请手动导航到「{chat_name}」群聊,然后按回车继续...", file=sys.stderr)
        input()

    # Scroll to the top repeatedly so that older messages get loaded.
    print("正在加载消息历史...", file=sys.stderr)
    messages_container = page.query_selector('.message-list, .im-message-list, [data-testid="message-list"]')
    if messages_container:
        for _ in range(10):
            page.evaluate("el => el.scrollTop = 0", messages_container)
            time.sleep(1.5)
    else:
        # No known container: fall back to a keyboard shortcut.
        for _ in range(10):
            page.keyboard.press("Control+Home")
            time.sleep(1.5)
    time.sleep(2)

    # Extract the messages. The filter values are passed as an argument
    # rather than spliced into the script text, so a quote or backslash in
    # ``target_name`` can neither break nor alter the JavaScript.
    messages = page.evaluate(
        """
        ({ target, limit }) => {
            const results = [];
            // 常见的消息 DOM 结构
            const msgSelectors = [
                '.message-item',
                '.im-message-item',
                '[data-message-id]',
                '.msg-list-item',
            ];
            let items = [];
            for (const sel of msgSelectors) {
                items = document.querySelectorAll(sel);
                if (items.length > 0) break;
            }
            items.forEach(item => {
                const senderEl = item.querySelector(
                    '.sender-name, .message-sender, [data-testid="sender-name"], .name'
                );
                const contentEl = item.querySelector(
                    '.message-content, .msg-content, [data-testid="message-content"], .text-content'
                );
                const timeEl = item.querySelector(
                    '.message-time, .msg-time, [data-testid="message-time"], .time'
                );
                const sender = senderEl ? senderEl.innerText.trim() : '';
                const content = contentEl ? contentEl.innerText.trim() : '';
                const time = timeEl ? timeEl.innerText.trim() : '';
                if (!content) return;
                if (target && !sender.includes(target)) return;
                results.push({ sender, content, time });
            });
            return limit > 0 ? results.slice(-limit) : results;
        }
        """,
        {"target": target_name, "limit": limit},
    )

    if not messages:
        print("⚠️ 未能自动提取消息,尝试提取页面文本", file=sys.stderr)
        return page.inner_text("body")

    # Split by length: messages over 50 characters are listed first, and the
    # short-message section is capped at its first 200 entries.
    long_msgs = [m for m in messages if len(m.get("content", "")) > 50]
    short_msgs = [m for m in messages if len(m.get("content", "")) <= 50]

    lines = [
        "# 飞书消息记录(浏览器抓取)",
        f"群聊:{chat_name}",
        f"目标人物:{target_name}",
        f"{len(messages)} 条消息",
        "",
        "---",
        "",
        "## 长消息(观点/决策类)",
        "",
    ]
    for m in long_msgs:
        lines.append(f"[{m.get('time', '')}] {m.get('content', '')}")
        lines.append("")
    lines += ["---", "", "## 日常消息", ""]
    for m in short_msgs[:200]:
        lines.append(f"[{m.get('time', '')}] {m.get('content', '')}")
    return "\n".join(lines)
def main() -> None:
    """Command-line entry point: scrape a Feishu page or chat and output it.

    Parses the arguments, launches a browser context, checks the login
    state, runs the fetcher that matches the request (``--url`` takes
    precedence over ``--chat``) and writes the result to ``--output`` or to
    stdout.

    Raises:
        SystemExit: On invalid arguments, when Playwright is not installed,
            when login is required in headless mode, or when fewer than 10
            characters of content were extracted.
    """
    parser = argparse.ArgumentParser(description="飞书浏览器抓取器(复用 Chrome 登录态)")
    parser.add_argument("--url", help="飞书文档/Wiki/表格链接")
    parser.add_argument("--chat", help="群聊名称(抓取消息记录时使用)")
    parser.add_argument("--target", help="目标人物姓名(只提取此人的消息)")
    parser.add_argument("--limit", type=int, default=500, help="最多抓取消息条数(默认 500)")
    parser.add_argument("--output", default=None, help="输出文件路径(默认打印到 stdout)")
    parser.add_argument("--chrome-profile", default=None, help="Chrome Profile 路径(默认自动检测)")
    parser.add_argument("--headless", action="store_true", help="无头模式(不显示浏览器窗口)")
    parser.add_argument("--show-browser", action="store_true", help="显示浏览器窗口(调试用)")
    args = parser.parse_args()

    if not args.url and not args.chat:
        parser.error("请提供 --url(文档链接)或 --chat(群聊名称)")

    # Imported lazily so that a missing dependency yields an install hint
    # instead of a traceback.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print("错误:请先安装 Playwright:pip install playwright && playwright install chromium", file=sys.stderr)
        sys.exit(1)

    # --show-browser overrides --headless.
    headless = args.headless and not args.show_browser
    print(f"启动浏览器({'无头' if headless else '有界面'}模式)...", file=sys.stderr)

    with sync_playwright() as p:
        ctx = make_context(p, args.chrome_profile, headless=headless)
        # Close the context on every exit path, including fetch errors and
        # the early sys.exit below.
        try:
            page = ctx.new_page()

            # Check the login state: a redirect to a login/sign-in URL is
            # taken to mean that the session is not authenticated.
            page.goto("https://www.feishu.cn", wait_until="domcontentloaded", timeout=15000)
            time.sleep(2)
            current_url = page.url.lower()
            if "login" in current_url or "signin" in current_url:
                print("⚠️ 检测到未登录状态。", file=sys.stderr)
                print(" 请在打开的浏览器窗口中登录飞书,登录后按回车继续...", file=sys.stderr)
                if headless:
                    # There is no window to log in with, so give up.
                    print(" 提示:请用 --show-browser 参数显示浏览器窗口以完成登录", file=sys.stderr)
                    sys.exit(1)
                input()

            # Dispatch on the kind of task.
            if args.url:
                page_type = detect_page_type(args.url)
                print(f"页面类型:{page_type},开始抓取...", file=sys.stderr)
                if page_type == "sheet":
                    content = fetch_sheet(page, args.url)
                else:
                    content = fetch_doc(page, args.url)
            else:
                content = fetch_messages(
                    page,
                    chat_name=args.chat,
                    target_name=args.target or "",
                    limit=args.limit,
                )
        finally:
            ctx.close()

    if not content or len(content.strip()) < 10:
        print("⚠️ 未能提取到有效内容", file=sys.stderr)
        sys.exit(1)

    if args.output:
        Path(args.output).write_text(content, encoding="utf-8")
        print(f"✅ 已保存到 {args.output}({len(content)} 字符)", file=sys.stderr)
    else:
        print(content)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()