Files
colleague-skill/tools/feishu_auto_collector.py
titanwings 6a0b31aa6c refactor: restructure to official AgentSkills/Claude Code skill format
- Flatten colleague-creator/ to repo root (repo = skill directory)
- Update SKILL.md frontmatter with official fields: name, description,
  argument-hint, version, user-invocable, allowed-tools
- Move PRD.md → docs/PRD.md
- Add .gitignore, requirements.txt, LICENSE
- Update README and INSTALL docs to reflect new structure and git clone install

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-30 13:37:54 +08:00

606 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Feishu (Lark) automatic data collector.

Given a colleague's name, automatically:
1. Search Feishu users and obtain the user_id
2. Find group chats shared with them and pull their message history
3. Search documents and wiki pages they created/edited
4. Fetch document contents
5. Fetch Bitable (multi-dimensional table) data, if any
6. Emit a unified format that feeds directly into the colleague-creator
   analysis pipeline

Prerequisite:
    python3 feishu_auto_collector.py --setup   # configure App ID / Secret (one-time)

Usage:
    python3 feishu_auto_collector.py --name "张三" --output-dir ./knowledge/zhangsan
    python3 feishu_auto_collector.py --name "张三" --msg-limit 1000 --doc-limit 20
"""
from __future__ import annotations

import json
import sys
import time
import argparse
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

try:
    import requests
except ImportError:
    # Fail fast with an actionable hint instead of a bare traceback.
    print("错误:请先安装 requestspip3 install requests", file=sys.stderr)
    sys.exit(1)

# Persisted app credentials (written by `--setup`, read by load_config()).
CONFIG_PATH = Path.home() / ".colleague-skill" / "feishu_config.json"
# Feishu Open Platform API root.
BASE_URL = "https://open.feishu.cn/open-apis"
# ─── 配置 ────────────────────────────────────────────────────────────────────
def load_config() -> dict:
    """Load the saved Feishu app credentials.

    Exits the process with an instructive message when `--setup` has never
    been run (no config file present).

    Returns:
        The parsed config dict ({"app_id": ..., "app_secret": ...}).
    """
    if not CONFIG_PATH.exists():
        print("未找到配置请先运行python3 feishu_auto_collector.py --setup", file=sys.stderr)
        sys.exit(1)
    # save_config() writes UTF-8 JSON; read it back explicitly so behavior
    # does not depend on the platform's locale encoding.
    return json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
def save_config(config: dict) -> None:
    """Persist credentials to CONFIG_PATH, creating the parent dir if needed.

    Args:
        config: Dict with at least "app_id" and "app_secret".
    """
    CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8 so the ensure_ascii=False (non-ASCII) JSON round-trips
    # on any platform regardless of locale encoding.
    CONFIG_PATH.write_text(
        json.dumps(config, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
def setup_config() -> None:
    """Interactively prompt for Feishu app credentials and persist them.

    Prints the open-platform permissions the self-built app needs, then asks
    for App ID / App Secret and writes them via save_config().
    """
    permission_guide = (
        "=== 飞书自动采集配置 ===\n",
        "请前往 https://open.feishu.cn 创建企业自建应用,开通以下权限:",
        "",
        " 消息类:",
        " im:message:readonly 读取消息",
        " im:chat:readonly 读取群聊信息",
        " im:chat.members:readonly 读取群成员",
        "",
        " 用户类:",
        " contact:user.base:readonly 搜索用户",
        "",
        " 文档类:",
        " docs:doc:readonly 读取文档",
        " wiki:wiki:readonly 读取知识库",
        " drive:drive:readonly 搜索云盘文件",
        "",
        " 多维表格:",
        " bitable:app:readonly 读取多维表格",
        "",
    )
    for guide_line in permission_guide:
        print(guide_line)
    app_id = input("App ID (cli_xxx): ").strip()
    app_secret = input("App Secret: ").strip()
    save_config({"app_id": app_id, "app_secret": app_secret})
    print(f"\n✅ 配置已保存到 {CONFIG_PATH}")
# ─── Token ───────────────────────────────────────────────────────────────────
# Process-wide token cache: {"token": str, "expire": epoch-seconds}.
_token_cache: dict = {}


def get_tenant_token(config: dict) -> str:
    """Return a tenant_access_token, cached until shortly before expiry.

    Refreshes via the open-api internal endpoint when fewer than ~60 seconds
    of validity remain; exits the process on an API error.
    """
    now = time.time()
    cached = _token_cache.get("token")
    # 60-second safety margin so a token never expires mid-request.
    if cached and _token_cache.get("expire", 0) > now + 60:
        return cached

    resp = requests.post(
        f"{BASE_URL}/auth/v3/tenant_access_token/internal",
        json={"app_id": config["app_id"], "app_secret": config["app_secret"]},
        timeout=10,
    )
    data = resp.json()
    if data.get("code") != 0:
        print(f"获取 token 失败:{data}", file=sys.stderr)
        sys.exit(1)

    token = data["tenant_access_token"]
    _token_cache.update(token=token, expire=now + data.get("expire", 7200))
    return token
def api_get(path: str, params: dict, config: dict) -> dict:
    """GET an open-api endpoint with a tenant token; return the parsed JSON."""
    headers = {"Authorization": f"Bearer {get_tenant_token(config)}"}
    response = requests.get(
        f"{BASE_URL}{path}",
        params=params,
        headers=headers,
        timeout=15,
    )
    return response.json()
def api_post(path: str, body: dict, config: dict) -> dict:
    """POST a JSON body to an open-api endpoint with a tenant token; return parsed JSON."""
    headers = {"Authorization": f"Bearer {get_tenant_token(config)}"}
    response = requests.post(
        f"{BASE_URL}{path}",
        json=body,
        headers=headers,
        timeout=15,
    )
    return response.json()
# ─── User search ─────────────────────────────────────────────────────────────
def find_user(name: str, config: dict) -> Optional[dict]:
    """Search Feishu users by display name.

    Args:
        name: The colleague's display name to search for.
        config: App credentials dict for token acquisition.

    Returns:
        The matching user dict. When several users share the name, prompts
        the operator to pick one (defaulting to the first). Returns None on
        API failure or when nobody matches.
    """
    print(f" 搜索用户:{name} ...", file=sys.stderr)
    data = api_get(
        "/search/v1/user",
        {"query": name, "page_size": 10},
        config,
    )
    if data.get("code") != 0:
        print(f" 搜索用户失败code={data.get('code')}{data.get('msg')}", file=sys.stderr)
        return None
    users = data.get("data", {}).get("results", [])
    if not users:
        print(f" 未找到用户:{name}", file=sys.stderr)
        return None
    if len(users) == 1:
        u = users[0]
        # department_path may be absent OR present-but-empty; guard both so a
        # user with no department no longer raises IndexError here.
        dept = u.get("department_path") or [""]
        print(f" 找到用户:{u.get('name')}{dept[0]}", file=sys.stderr)
        return u
    # Multiple hits: let the operator choose interactively.
    print(f"\n 找到 {len(users)} 个结果,请选择:")
    for i, u in enumerate(users):
        dept = u.get("department_path", [""])
        dept_str = dept[0] if dept else ""
        print(f" [{i+1}] {u.get('name')} {dept_str} {u.get('user_id', '')}")
    choice = input("\n 选择编号(默认 1").strip() or "1"
    try:
        return users[int(choice) - 1]
    except (ValueError, IndexError):
        # Garbage or out-of-range input: fall back to the first result.
        return users[0]
# ─── Messages ────────────────────────────────────────────────────────────────
def get_chats_with_user(user_open_id: str, config: dict) -> list:
    """Return the group chats that both the bot and the target user belong to.

    Lists every chat the bot is in (paged), then keeps those whose member
    list contains the target user.
    """
    print(" 获取群聊列表 ...", file=sys.stderr)
    chats: list = []
    page_token = None
    while True:
        params = {"page_size": 100}
        if page_token:
            params["page_token"] = page_token
        data = api_get("/im/v1/chats", params, config)
        if data.get("code") != 0:
            print(f" 获取群聊失败:{data.get('msg')}", file=sys.stderr)
            break
        payload = data.get("data", {})
        chats.extend(payload.get("items", []))
        page_token = payload.get("page_token")
        # Also stop when the API claims more pages but returns no token,
        # which would otherwise re-fetch page one forever.
        if not payload.get("has_more") or not page_token:
            break
    print(f"{len(chats)} 个群聊,检查成员 ...", file=sys.stderr)
    result = []
    for chat in chats:
        chat_id = chat.get("chat_id")
        if not chat_id:
            continue
        if _chat_has_member(chat_id, user_open_id, config):
            result.append(chat)
            print(f"{chat.get('name', chat_id)}", file=sys.stderr)
    return result


def _chat_has_member(chat_id: str, user_open_id: str, config: dict) -> bool:
    """Return True if the user appears in the chat's full member list.

    Pages through /im/v1/chats/{id}/members; the previous implementation
    only inspected the first 100 members, silently missing the target user
    in larger chats.
    """
    page_token = None
    while True:
        params = {"page_size": 100}
        if page_token:
            params["page_token"] = page_token
        members_data = api_get(f"/im/v1/chats/{chat_id}/members", params, config)
        payload = members_data.get("data", {})
        for m in payload.get("items", []):
            if user_open_id in (m.get("member_id"), m.get("open_id")):
                return True
        page_token = payload.get("page_token")
        if not payload.get("has_more") or not page_token:
            return False
def _parse_message_content(content_raw: str) -> str:
    """Extract plain text from a Feishu message `body.content` JSON string.

    Handles plain text messages ({"text": "..."}) as well as rich-text
    "post" messages ({"content": [[{tag, text}, ...], ...]}). Unparseable
    payloads fall back to the raw string.
    """
    try:
        obj = json.loads(content_raw)
    except Exception:
        return content_raw
    if isinstance(obj, dict):
        # Plain text messages carry a top-level "text" key and no "content";
        # the previous code only looked at "content", so every plain text
        # message collapsed to "" and was dropped.
        if "text" in obj and not obj.get("content"):
            return str(obj.get("text", ""))
        text_parts = []
        for line in obj.get("content", []):
            for seg in line:
                if seg.get("tag") in ("text", "a"):
                    text_parts.append(seg.get("text", ""))
        return " ".join(text_parts)
    return str(obj)


def fetch_messages_from_chat(
    chat_id: str,
    user_open_id: str,
    limit: int,
    config: dict,
) -> list:
    """Pull up to `limit` messages sent by the target user from one chat.

    Pages through /im/v1/messages newest-first, keeping only messages whose
    sender matches and whose text survives placeholder filtering.

    Returns:
        List of {"content": str, "time": str} dicts.
    """
    messages: list = []
    page_token = None
    while len(messages) < limit:
        params = {
            "container_id_type": "chat",
            "container_id": chat_id,
            "page_size": 50,
            "sort_type": "ByCreateTimeDesc",
        }
        if page_token:
            params["page_token"] = page_token
        data = api_get("/im/v1/messages", params, config)
        if data.get("code") != 0:
            break
        payload = data.get("data", {})
        items = payload.get("items", [])
        if not items:
            break
        for item in items:
            sender = item.get("sender", {})
            sender_id = sender.get("id") or sender.get("open_id", "")
            if sender_id != user_open_id:
                continue
            content = _parse_message_content(
                item.get("body", {}).get("content", "")
            ).strip()
            # Skip empty text and media placeholders.
            if not content or content in ("[图片]", "[文件]", "[表情]", "[语音]"):
                continue
            ts = item.get("create_time", "")
            if ts:
                try:
                    # NOTE(review): create_time is treated as epoch milliseconds
                    # here — confirm against the API; a seconds-based timestamp
                    # would render wildly wrong dates.
                    ts = datetime.fromtimestamp(int(ts) / 1000).strftime("%Y-%m-%d %H:%M")
                except Exception:
                    pass
            messages.append({"content": content, "time": ts})
        if not payload.get("has_more"):
            break
        page_token = payload.get("page_token")
    return messages[:limit]
def collect_messages(
    user: dict,
    msg_limit: int,
    config: dict,
) -> str:
    """Gather the target user's messages across shared chats, as markdown.

    Splits output into long messages (opinions/decisions) and short ones
    (style reference, capped at 300).
    """
    user_open_id = user.get("open_id") or user.get("user_id", "")
    name = user.get("name", "")
    chats = get_chats_with_user(user_open_id, config)
    if not chats:
        return f"# 消息记录\n\n未找到与 {name} 共同的群聊(请确认 bot 已被添加到相关群)\n"

    all_messages: list = []
    # Spread the budget across chats, but never below 100 per chat.
    per_chat_limit = max(100, msg_limit // len(chats))
    for chat in chats:
        chat_id = chat.get("chat_id")
        chat_name = chat.get("name", chat_id)
        print(f" 拉取「{chat_name}」消息 ...", file=sys.stderr)
        fetched = fetch_messages_from_chat(chat_id, user_open_id, per_chat_limit, config)
        for msg in fetched:
            msg["chat"] = chat_name
        all_messages.extend(fetched)
        print(f" 获取 {len(fetched)}", file=sys.stderr)

    # Partition by length: >50 chars counts as substantive.
    long_msgs: list = []
    short_msgs: list = []
    for msg in all_messages:
        bucket = long_msgs if len(msg.get("content", "")) > 50 else short_msgs
        bucket.append(msg)

    lines = [
        f"# 飞书消息记录(自动采集)",
        f"目标:{name}",
        f"来源群聊:{', '.join(c.get('name', '') for c in chats)}",
        f"{len(all_messages)} 条消息",
        "",
        "---",
        "",
        "## 长消息(观点/决策/技术类)",
        "",
    ]
    for msg in long_msgs:
        lines.append(f"[{msg.get('time', '')}][{msg.get('chat', '')}] {msg['content']}")
        lines.append("")
    lines += ["---", "", "## 日常消息(风格参考)", ""]
    for msg in short_msgs[:300]:
        lines.append(f"[{msg.get('time', '')}] {msg['content']}")
    return "\n".join(lines)
# ─── Documents ───────────────────────────────────────────────────────────────
def search_docs_by_user(user_open_id: str, name: str, doc_limit: int, config: dict) -> list:
    """Search for documents created or edited by the target user.

    First tries a creator-filtered search; on failure retries as a plain
    keyword search. NOTE(review): the "/search/v2/message" path with
    search_type "docs" looks unusual for a docs search — verify against the
    current Feishu search API.
    """
    print(f" 搜索 {name} 的文档 ...", file=sys.stderr)
    body = {
        "query": name,
        "search_type": "docs",
        "docs_options": {
            "creator_ids": [user_open_id],
        },
        "page_size": doc_limit,
    }
    data = api_post("/search/v2/message", body, config)
    if data.get("code") != 0:
        # Fallback: same query without the creator filter.
        print(f" 按创建人搜索失败,改用关键词搜索 ...", file=sys.stderr)
        fallback_body = {key: val for key, val in body.items() if key != "docs_options"}
        data = api_post("/search/v2/message", fallback_body, config)

    docs = []
    for hit in data.get("data", {}).get("results", []):
        info = hit.get("docs_info", {})
        if info:
            docs.append({
                "title": info.get("title", ""),
                "url": info.get("url", ""),
                "type": info.get("docs_type", ""),
                "creator": info.get("creator", {}).get("name", ""),
            })
    print(f" 找到 {len(docs)} 篇文档", file=sys.stderr)
    return docs
def fetch_doc_content(doc_token: str, doc_type: str, config: dict) -> str:
    """Fetch one document's raw text; wiki nodes resolve to their backing doc.

    Returns "" for unsupported doc types.
    """
    if doc_type in ("doc", "docx"):
        data = api_get(f"/docx/v1/documents/{doc_token}/raw_content", {}, config)
        return data.get("data", {}).get("content", "")
    if doc_type == "wiki":
        # A wiki token points at a node; resolve it to the underlying
        # document token/type, then recurse once.
        node_data = api_get(f"/wiki/v2/spaces/get_node", {"token": doc_token}, config)
        node = node_data.get("data", {}).get("node", {})
        return fetch_doc_content(
            node.get("obj_token", doc_token),
            node.get("obj_type", "docx"),
            config,
        )
    return ""
def collect_docs(user: dict, doc_limit: int, config: dict) -> str:
    """Fetch the contents of the user's documents and format as markdown."""
    import re

    user_open_id = user.get("open_id") or user.get("user_id", "")
    name = user.get("name", "")
    docs = search_docs_by_user(user_open_id, name, doc_limit, config)
    if not docs:
        return f"# 文档内容\n\n未找到 {name} 相关文档\n"

    lines = [
        f"# 文档内容(自动采集)",
        f"目标:{name}",
        f"{len(docs)}",
        "",
    ]
    for doc in docs:
        url = doc.get("url", "")
        title = doc.get("title", "无标题")
        print(f" 拉取文档:{title} ...", file=sys.stderr)
        # Pull the doc token out of the share URL; skip unrecognized links.
        token_match = re.search(r"/(?:wiki|docx|docs|sheets|base)/([A-Za-z0-9]+)", url)
        if token_match is None:
            continue
        content = fetch_doc_content(
            token_match.group(1),
            doc.get("type", "") or "docx",
            config,
        )
        # Near-empty bodies (<20 chars) aren't worth keeping.
        if not content or len(content.strip()) < 20:
            print(f" 内容为空,跳过", file=sys.stderr)
            continue
        lines.extend([
            f"---",
            f"## 《{title}",
            f"链接:{url}",
            f"创建人:{doc.get('creator', '')}",
            "",
            content.strip(),
            "",
        ])
    return "\n".join(lines)
# ─── Bitable ─────────────────────────────────────────────────────────────────
def collect_bitable(app_token: str, config: dict) -> str:
    """Dump every table of a Bitable app as markdown tables.

    Fetches up to the first 100 tables / fields / records per listing call.
    """

    def render_cell(val) -> str:
        # Rich cells come back as lists of dicts (or scalars); flatten to text
        # and strip characters that would break the markdown table layout.
        if isinstance(val, list):
            val = " ".join(
                v.get("text", str(v)) if isinstance(v, dict) else str(v)
                for v in val
            )
        return str(val).replace("|", "").replace("\n", " ")

    data = api_get(f"/bitable/v1/apps/{app_token}/tables", {"page_size": 100}, config)
    tables = data.get("data", {}).get("items", [])
    if not tables:
        return "(多维表格为空)\n"

    lines: list = []
    for table in tables:
        table_id = table.get("table_id")
        table_name = table.get("name", table_id)
        fields_data = api_get(
            f"/bitable/v1/apps/{app_token}/tables/{table_id}/fields",
            {"page_size": 100},
            config,
        )
        fields = [
            f.get("field_name", "")
            for f in fields_data.get("data", {}).get("items", [])
        ]
        records_data = api_get(
            f"/bitable/v1/apps/{app_token}/tables/{table_id}/records",
            {"page_size": 100},
            config,
        )
        records = records_data.get("data", {}).get("items", [])

        lines += [
            f"### 表:{table_name}",
            "",
            "| " + " | ".join(fields) + " |",
            "| " + " | ".join(["---"] * len(fields)) + " |",
        ]
        for rec in records:
            row_data = rec.get("fields", {})
            cells = [render_cell(row_data.get(f, "")) for f in fields]
            lines.append("| " + " | ".join(cells) + " |")
        lines.append("")
    return "\n".join(lines)
# ─── Main pipeline ───────────────────────────────────────────────────────────
def collect_all(
    name: str,
    output_dir: Path,
    msg_limit: int,
    doc_limit: int,
    config: dict,
) -> dict:
    """Run the full collection pipeline for one colleague into output_dir.

    Writes messages.txt, docs.txt, and a collection_summary.json manifest.
    Each step is best-effort: a failure is logged and the next step runs.

    Returns:
        Mapping of artifact kind ("messages"/"docs") to written file path.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    results: dict = {}
    print(f"\n🔍 开始采集:{name}\n", file=sys.stderr)

    # Step 1: resolve the colleague to a Feishu user record (fatal on miss).
    user = find_user(name, config)
    if not user:
        print(f"❌ 未找到用户 {name},请检查姓名是否正确", file=sys.stderr)
        sys.exit(1)

    # Step 2: message history (best-effort).
    print(f"\n📨 采集消息记录(上限 {msg_limit} 条)...", file=sys.stderr)
    try:
        rendered = collect_messages(user, msg_limit, config)
        target = output_dir / "messages.txt"
        target.write_text(rendered, encoding="utf-8")
        results["messages"] = str(target)
        print(f" ✅ 消息记录 → {target}", file=sys.stderr)
    except Exception as e:
        print(f" ⚠️ 消息采集失败:{e}", file=sys.stderr)

    # Step 3: documents (best-effort).
    print(f"\n📄 采集文档(上限 {doc_limit} 篇)...", file=sys.stderr)
    try:
        rendered = collect_docs(user, doc_limit, config)
        target = output_dir / "docs.txt"
        target.write_text(rendered, encoding="utf-8")
        results["docs"] = str(target)
        print(f" ✅ 文档内容 → {target}", file=sys.stderr)
    except Exception as e:
        print(f" ⚠️ 文档采集失败:{e}", file=sys.stderr)

    # Summary manifest for downstream tooling.
    summary = {
        "name": name,
        "user_id": user.get("user_id", ""),
        "open_id": user.get("open_id", ""),
        "department": user.get("department_path", []),
        "collected_at": datetime.now(timezone.utc).isoformat(),
        "files": results,
    }
    (output_dir / "collection_summary.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2)
    )
    print(f"\n✅ 采集完成,输出目录:{output_dir}", file=sys.stderr)
    return results
def main() -> None:
    """CLI entry point: parse arguments, then run setup or the pipeline."""
    parser = argparse.ArgumentParser(description="飞书数据自动采集器")
    parser.add_argument("--setup", action="store_true", help="初始化配置")
    parser.add_argument("--name", help="同事姓名")
    parser.add_argument("--output-dir", default=None, help="输出目录(默认 ./knowledge/{name}")
    parser.add_argument("--msg-limit", type=int, default=1000, help="最多采集消息条数(默认 1000")
    parser.add_argument("--doc-limit", type=int, default=20, help="最多采集文档篇数(默认 20")
    args = parser.parse_args()

    if args.setup:
        setup_config()
        return
    if not args.name:
        parser.error("请提供 --name")

    config = load_config()
    if args.output_dir:
        target_dir = Path(args.output_dir)
    else:
        target_dir = Path(f"./knowledge/{args.name}")
    collect_all(
        name=args.name,
        output_dir=target_dir,
        msg_limit=args.msg_limit,
        doc_limit=args.doc_limit,
        config=config,
    )


if __name__ == "__main__":
    main()