#!/usr/bin/env python3 """ 飞书自动采集器 输入同事姓名,自动: 1. 搜索飞书用户,获取 user_id 2. 找到与他共同的群聊,拉取他的消息记录 3. 搜索他创建/编辑的文档和 Wiki 4. 拉取文档内容 5. 拉取多维表格(如有) 6. 输出统一格式,直接进 colleague-creator 分析流程 前置: python3 feishu_auto_collector.py --setup # 配置 App ID / Secret(一次性) 用法: python3 feishu_auto_collector.py --name "张三" --output-dir ./knowledge/zhangsan python3 feishu_auto_collector.py --name "张三" --msg-limit 1000 --doc-limit 20 """ from __future__ import annotations import json import sys import time import argparse from pathlib import Path from datetime import datetime, timezone from typing import Optional try: import requests except ImportError: print("错误:请先安装 requests:pip3 install requests", file=sys.stderr) sys.exit(1) CONFIG_PATH = Path.home() / ".colleague-skill" / "feishu_config.json" BASE_URL = "https://open.feishu.cn/open-apis" # ─── 配置 ──────────────────────────────────────────────────────────────────── def load_config() -> dict: if not CONFIG_PATH.exists(): print("未找到配置,请先运行:python3 feishu_auto_collector.py --setup", file=sys.stderr) sys.exit(1) return json.loads(CONFIG_PATH.read_text()) def save_config(config: dict) -> None: CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True) CONFIG_PATH.write_text(json.dumps(config, indent=2, ensure_ascii=False)) def setup_config() -> None: print("=== 飞书自动采集配置 ===\n") print("请前往 https://open.feishu.cn 创建企业自建应用,开通以下权限:") print() print(" 消息类:") print(" im:message:readonly 读取消息") print(" im:chat:readonly 读取群聊信息") print(" im:chat.members:readonly 读取群成员") print() print(" 用户类:") print(" contact:user.base:readonly 搜索用户") print() print(" 文档类:") print(" docs:doc:readonly 读取文档") print(" wiki:wiki:readonly 读取知识库") print(" drive:drive:readonly 搜索云盘文件") print() print(" 多维表格:") print(" bitable:app:readonly 读取多维表格") print() app_id = input("App ID (cli_xxx): ").strip() app_secret = input("App Secret: ").strip() config = {"app_id": app_id, "app_secret": app_secret} save_config(config) print(f"\n✅ 配置已保存到 {CONFIG_PATH}") # ─── Token ─────────────────────────────────────────────────────────────────── _token_cache: dict = {} def get_tenant_token(config: dict) -> str: """获取 tenant_access_token,带缓存(有效期约 2 小时)""" now = time.time() if _token_cache.get("token") and _token_cache.get("expire", 0) > now + 60: return _token_cache["token"] resp = requests.post( f"{BASE_URL}/auth/v3/tenant_access_token/internal", json={"app_id": config["app_id"], "app_secret": config["app_secret"]}, timeout=10, ) data = resp.json() if data.get("code") != 0: print(f"获取 token 失败:{data}", file=sys.stderr) sys.exit(1) token = data["tenant_access_token"] _token_cache["token"] = token _token_cache["expire"] = now + data.get("expire", 7200) return token def api_get(path: str, params: dict, config: dict) -> dict: token = get_tenant_token(config) resp = requests.get( f"{BASE_URL}{path}", params=params, headers={"Authorization": f"Bearer {token}"}, timeout=15, ) return resp.json() def api_post(path: str, body: dict, config: dict) -> dict: token = get_tenant_token(config) resp = requests.post( f"{BASE_URL}{path}", json=body, headers={"Authorization": f"Bearer {token}"}, timeout=15, ) return resp.json() # ─── 用户搜索 ───────────────────────────────────────────────────────────────── def find_user(name: str, config: dict) -> Optional[dict]: """通过姓名搜索飞书用户""" print(f" 搜索用户:{name} ...", file=sys.stderr) data = api_get( "/search/v1/user", {"query": name, "page_size": 10}, config, ) if data.get("code") != 0: print(f" 搜索用户失败(code={data.get('code')}):{data.get('msg')}", file=sys.stderr) return None users = data.get("data", {}).get("results", []) if not users: print(f" 未找到用户:{name}", file=sys.stderr) return None if len(users) == 1: u = users[0] print(f" 找到用户:{u.get('name')}({u.get('department_path', [''])[0]})", file=sys.stderr) return u # 多个结果,让用户选择 print(f"\n 找到 {len(users)} 个结果,请选择:") for i, u in enumerate(users): dept = u.get("department_path", [""]) dept_str = dept[0] if dept else "" print(f" [{i+1}] {u.get('name')} {dept_str} {u.get('user_id', '')}") choice = input("\n 选择编号(默认 1):").strip() or "1" try: idx = int(choice) - 1 return users[idx] except (ValueError, IndexError): return users[0] # ─── 消息记录 ───────────────────────────────────────────────────────────────── def get_chats_with_user(user_open_id: str, config: dict) -> list: """找到 bot 和目标用户共同在的群聊""" print(" 获取群聊列表 ...", file=sys.stderr) chats = [] page_token = None while True: params = {"page_size": 100} if page_token: params["page_token"] = page_token data = api_get("/im/v1/chats", params, config) if data.get("code") != 0: print(f" 获取群聊失败:{data.get('msg')}", file=sys.stderr) break items = data.get("data", {}).get("items", []) chats.extend(items) if not data.get("data", {}).get("has_more"): break page_token = data.get("data", {}).get("page_token") print(f" 共 {len(chats)} 个群聊,检查成员 ...", file=sys.stderr) # 过滤:目标用户在其中的群 result = [] for chat in chats: chat_id = chat.get("chat_id") if not chat_id: continue members_data = api_get( f"/im/v1/chats/{chat_id}/members", {"page_size": 100}, config, ) members = members_data.get("data", {}).get("items", []) for m in members: if m.get("member_id") == user_open_id or m.get("open_id") == user_open_id: result.append(chat) print(f" ✓ {chat.get('name', chat_id)}", file=sys.stderr) break return result def fetch_messages_from_chat( chat_id: str, user_open_id: str, limit: int, config: dict, ) -> list: """从指定群聊拉取目标用户的消息""" messages = [] page_token = None while len(messages) < limit: params = { "container_id_type": "chat", "container_id": chat_id, "page_size": 50, "sort_type": "ByCreateTimeDesc", } if page_token: params["page_token"] = page_token data = api_get("/im/v1/messages", params, config) if data.get("code") != 0: break items = data.get("data", {}).get("items", []) if not items: break for item in items: sender = item.get("sender", {}) sender_id = sender.get("id") or sender.get("open_id", "") if sender_id != user_open_id: continue # 解析消息内容 content_raw = item.get("body", {}).get("content", "") try: content_obj = json.loads(content_raw) # 富文本消息 if isinstance(content_obj, dict): text_parts = [] for line in content_obj.get("content", []): for seg in line: if seg.get("tag") in ("text", "a"): text_parts.append(seg.get("text", "")) content = " ".join(text_parts) else: content = str(content_obj) except Exception: content = content_raw content = content.strip() if not content or content in ("[图片]", "[文件]", "[表情]", "[语音]"): continue ts = item.get("create_time", "") if ts: try: ts = datetime.fromtimestamp(int(ts) / 1000).strftime("%Y-%m-%d %H:%M") except Exception: pass messages.append({"content": content, "time": ts}) if not data.get("data", {}).get("has_more"): break page_token = data.get("data", {}).get("page_token") return messages[:limit] def collect_messages( user: dict, msg_limit: int, config: dict, ) -> str: """采集目标用户的所有消息记录""" user_open_id = user.get("open_id") or user.get("user_id", "") name = user.get("name", "") chats = get_chats_with_user(user_open_id, config) if not chats: return f"# 消息记录\n\n未找到与 {name} 共同的群聊(请确认 bot 已被添加到相关群)\n" all_messages = [] per_chat_limit = max(100, msg_limit // len(chats)) for chat in chats: chat_id = chat.get("chat_id") chat_name = chat.get("name", chat_id) print(f" 拉取「{chat_name}」消息 ...", file=sys.stderr) msgs = fetch_messages_from_chat(chat_id, user_open_id, per_chat_limit, config) for m in msgs: m["chat"] = chat_name all_messages.extend(msgs) print(f" 获取 {len(msgs)} 条", file=sys.stderr) # 分类输出 long_msgs = [m for m in all_messages if len(m.get("content", "")) > 50] short_msgs = [m for m in all_messages if len(m.get("content", "")) <= 50] lines = [ f"# 飞书消息记录(自动采集)", f"目标:{name}", f"来源群聊:{', '.join(c.get('name', '') for c in chats)}", f"共 {len(all_messages)} 条消息", "", "---", "", "## 长消息(观点/决策/技术类)", "", ] for m in long_msgs: lines.append(f"[{m.get('time', '')}][{m.get('chat', '')}] {m['content']}") lines.append("") lines += ["---", "", "## 日常消息(风格参考)", ""] for m in short_msgs[:300]: lines.append(f"[{m.get('time', '')}] {m['content']}") return "\n".join(lines) # ─── 文档采集 ───────────────────────────────────────────────────────────────── def search_docs_by_user(user_open_id: str, name: str, doc_limit: int, config: dict) -> list: """搜索目标用户创建或编辑的文档""" print(f" 搜索 {name} 的文档 ...", file=sys.stderr) data = api_post( "/search/v2/message", { "query": name, "search_type": "docs", "docs_options": { "creator_ids": [user_open_id], }, "page_size": doc_limit, }, config, ) if data.get("code") != 0: # fallback:用关键词搜索 print(f" 按创建人搜索失败,改用关键词搜索 ...", file=sys.stderr) data = api_post( "/search/v2/message", { "query": name, "search_type": "docs", "page_size": doc_limit, }, config, ) docs = [] for item in data.get("data", {}).get("results", []): doc_info = item.get("docs_info", {}) if doc_info: docs.append({ "title": doc_info.get("title", ""), "url": doc_info.get("url", ""), "type": doc_info.get("docs_type", ""), "creator": doc_info.get("creator", {}).get("name", ""), }) print(f" 找到 {len(docs)} 篇文档", file=sys.stderr) return docs def fetch_doc_content(doc_token: str, doc_type: str, config: dict) -> str: """拉取单篇文档内容""" if doc_type in ("doc", "docx"): data = api_get(f"/docx/v1/documents/{doc_token}/raw_content", {}, config) return data.get("data", {}).get("content", "") elif doc_type == "wiki": # 先获取 wiki node 信息 node_data = api_get(f"/wiki/v2/spaces/get_node", {"token": doc_token}, config) obj_token = node_data.get("data", {}).get("node", {}).get("obj_token", doc_token) obj_type = node_data.get("data", {}).get("node", {}).get("obj_type", "docx") return fetch_doc_content(obj_token, obj_type, config) return "" def collect_docs(user: dict, doc_limit: int, config: dict) -> str: """采集目标用户的文档""" import re user_open_id = user.get("open_id") or user.get("user_id", "") name = user.get("name", "") docs = search_docs_by_user(user_open_id, name, doc_limit, config) if not docs: return f"# 文档内容\n\n未找到 {name} 相关文档\n" lines = [ f"# 文档内容(自动采集)", f"目标:{name}", f"共 {len(docs)} 篇", "", ] for doc in docs: url = doc.get("url", "") title = doc.get("title", "无标题") doc_type = doc.get("type", "") print(f" 拉取文档:{title} ...", file=sys.stderr) # 从 URL 提取 token token_match = re.search(r"/(?:wiki|docx|docs|sheets|base)/([A-Za-z0-9]+)", url) if not token_match: continue doc_token = token_match.group(1) content = fetch_doc_content(doc_token, doc_type or "docx", config) if not content or len(content.strip()) < 20: print(f" 内容为空,跳过", file=sys.stderr) continue lines += [ f"---", f"## 《{title}》", f"链接:{url}", f"创建人:{doc.get('creator', '')}", "", content.strip(), "", ] return "\n".join(lines) # ─── 多维表格 ───────────────────────────────────────────────────────────────── def collect_bitable(app_token: str, config: dict) -> str: """拉取多维表格内容""" # 获取所有 table data = api_get(f"/bitable/v1/apps/{app_token}/tables", {"page_size": 100}, config) tables = data.get("data", {}).get("items", []) if not tables: return "(多维表格为空)\n" lines = [] for table in tables: table_id = table.get("table_id") table_name = table.get("name", table_id) # 获取字段 fields_data = api_get( f"/bitable/v1/apps/{app_token}/tables/{table_id}/fields", {"page_size": 100}, config, ) fields = [f.get("field_name", "") for f in fields_data.get("data", {}).get("items", [])] # 获取记录 records_data = api_get( f"/bitable/v1/apps/{app_token}/tables/{table_id}/records", {"page_size": 100}, config, ) records = records_data.get("data", {}).get("items", []) lines.append(f"### 表:{table_name}") lines.append("") lines.append("| " + " | ".join(fields) + " |") lines.append("| " + " | ".join(["---"] * len(fields)) + " |") for rec in records: row_data = rec.get("fields", {}) row = [] for f in fields: val = row_data.get(f, "") if isinstance(val, list): val = " ".join( v.get("text", str(v)) if isinstance(v, dict) else str(v) for v in val ) row.append(str(val).replace("|", "|").replace("\n", " ")) lines.append("| " + " | ".join(row) + " |") lines.append("") return "\n".join(lines) # ─── 主流程 ─────────────────────────────────────────────────────────────────── def collect_all( name: str, output_dir: Path, msg_limit: int, doc_limit: int, config: dict, ) -> dict: """采集某同事的所有可用数据,输出到 output_dir""" output_dir.mkdir(parents=True, exist_ok=True) results = {} print(f"\n🔍 开始采集:{name}\n", file=sys.stderr) # Step 1: 搜索用户 user = find_user(name, config) if not user: print(f"❌ 未找到用户 {name},请检查姓名是否正确", file=sys.stderr) sys.exit(1) # Step 2: 采集消息记录 print(f"\n📨 采集消息记录(上限 {msg_limit} 条)...", file=sys.stderr) try: msg_content = collect_messages(user, msg_limit, config) msg_path = output_dir / "messages.txt" msg_path.write_text(msg_content, encoding="utf-8") results["messages"] = str(msg_path) print(f" ✅ 消息记录 → {msg_path}", file=sys.stderr) except Exception as e: print(f" ⚠️ 消息采集失败:{e}", file=sys.stderr) # Step 3: 采集文档 print(f"\n📄 采集文档(上限 {doc_limit} 篇)...", file=sys.stderr) try: doc_content = collect_docs(user, doc_limit, config) doc_path = output_dir / "docs.txt" doc_path.write_text(doc_content, encoding="utf-8") results["docs"] = str(doc_path) print(f" ✅ 文档内容 → {doc_path}", file=sys.stderr) except Exception as e: print(f" ⚠️ 文档采集失败:{e}", file=sys.stderr) # 写摘要 summary = { "name": name, "user_id": user.get("user_id", ""), "open_id": user.get("open_id", ""), "department": user.get("department_path", []), "collected_at": datetime.now(timezone.utc).isoformat(), "files": results, } (output_dir / "collection_summary.json").write_text( json.dumps(summary, ensure_ascii=False, indent=2) ) print(f"\n✅ 采集完成,输出目录:{output_dir}", file=sys.stderr) return results def main() -> None: parser = argparse.ArgumentParser(description="飞书数据自动采集器") parser.add_argument("--setup", action="store_true", help="初始化配置") parser.add_argument("--name", help="同事姓名") parser.add_argument("--output-dir", default=None, help="输出目录(默认 ./knowledge/{name})") parser.add_argument("--msg-limit", type=int, default=1000, help="最多采集消息条数(默认 1000)") parser.add_argument("--doc-limit", type=int, default=20, help="最多采集文档篇数(默认 20)") args = parser.parse_args() if args.setup: setup_config() return if not args.name: parser.error("请提供 --name") config = load_config() output_dir = Path(args.output_dir) if args.output_dir else Path(f"./knowledge/{args.name}") collect_all( name=args.name, output_dir=output_dir, msg_limit=args.msg_limit, doc_limit=args.doc_limit, config=config, ) if __name__ == "__main__": main()