| | |
| | | return None |
| | | |
| | | |
def parse_timestamp(ts: Any) -> Optional[datetime]:
    """Parse a timestamp in various formats into a naive datetime.

    Supports ISO 8601 strings (including a trailing ``Z`` for UTC) and
    second- or millisecond-precision Unix timestamps.

    Args:
        ts: An int/float Unix timestamp, an ISO 8601 string, or any
            other value (which yields ``None``).

    Returns:
        A naive ``datetime`` on success, otherwise ``None``. Falsy
        inputs (``None``, ``""``, ``0``) are treated as missing.
    """
    if not ts:
        return None

    # Numeric: Unix timestamp. Values above 1e10 are assumed to be in
    # milliseconds (1e10 seconds would be the year 2286, far beyond any
    # plausible data here).
    if isinstance(ts, (int, float)):
        ts_sec = ts / 1000 if ts > 1e10 else ts
        try:
            return datetime.fromtimestamp(ts_sec)
        except (ValueError, OSError):
            return None

    # String: ISO 8601. datetime.fromisoformat (3.7+) does not accept a
    # literal 'Z' suffix, so map it to an explicit UTC offset first.
    if isinstance(ts, str):
        try:
            dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
            # Strip tzinfo so all results are uniformly naive.
            # NOTE(review): this discards the offset without converting
            # to local time — the wall-clock value stays in the source
            # zone. Confirm that is the intended behavior.
            if dt.tzinfo is not None:
                dt = dt.replace(tzinfo=None)
            return dt
        except (ValueError, TypeError):
            return None

    return None
| | | |
| | | |
def extract_messages_from_session(file_info: Dict) -> List[Dict]:
    """
    Extract all real user messages from a session file.

    Enhanced: filters out system messages, extracts actual user content.
    Optimized:
    1. Parses each message's own timestamp (instead of using file mtime)
    2. Extracts the real send time from Feishu message text
    3. Improved content de-duplication and filtering

    NOTE(review): this block appears truncated/corrupted as extracted —
    the file-reading loop that would define `text`, `record`, `msg`, and
    `session_name` is missing; the first `messages.append` below looks
    like a leftover duplicate of the later, fuller one; the bare `break`
    statements have no enclosing loop in view; and the trailing
    `except json.JSONDecodeError:` has no matching `try:` or body.
    The original (broken) code is preserved unchanged.
    """
    messages = []
    file_path = file_info['path']

    # Extract the real user content (filtering out system messages).
    user_content = extract_user_content(text)
    if user_content:
        messages.append({
            'timestamp': record.get("timestamp", ""),
            'content': user_content[:400],  # cap length
            'session': session_name,
            'session_time': file_info['mtime'].strftime('%H:%M:%S')
        })
    if not user_content:
        break

    # Resolve the timestamp — priority order:
    # 1. record-level timestamp (ISO 8601)
    # 2. timestamp inside the message (millisecond Unix)
    # 3. time extracted from the Feishu message text
    # 4. finally, the file modification time

    msg_time = None
    time_source = "unknown"

    # Try the record-level timestamp.
    record_ts = record.get("timestamp")
    if record_ts:
        msg_time = parse_timestamp(record_ts)
        if msg_time:
            time_source = "record"

    # Try the message-internal timestamp (millisecond Unix).
    if not msg_time and "timestamp" in msg:
        msg_time = parse_timestamp(msg.get("timestamp"))
        if msg_time:
            time_source = "message"

    # Try to pull a "[YYYY-MM-DD HH:MM:SS" time out of the Feishu text.
    if not msg_time:
        feishu_time_match = re.search(r'\[(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})', text)
        if feishu_time_match:
            try:
                msg_time = datetime.strptime(feishu_time_match.group(1), "%Y-%m-%d %H:%M:%S")
                time_source = "feishu_text"
            except ValueError:
                pass

    # Last resort: the file modification time.
    if not msg_time:
        msg_time = file_info['mtime']
        time_source = "file_mtime"

    messages.append({
        'timestamp': msg_time.isoformat() if msg_time else "",
        'timestamp_dt': msg_time,
        'content': user_content[:400],  # cap length
        'session': session_name,
        'time_str': msg_time.strftime('%H:%M:%S') if msg_time else 'unknown',
        'time_source': time_source,
        'content_hash': hash(user_content[:100])  # for de-duplication
    })
    break

    except json.JSONDecodeError:

    return messages
| | | |
| | | |
def deduplicate_messages(messages: List[Dict]) -> List[Dict]:
    """De-duplicate messages across sessions.

    Two messages are considered duplicates when they share the same
    content hash and fall within a 5-minute window of each other; when a
    duplicate has longer content, it replaces the copy already kept.

    Args:
        messages: Message dicts carrying 'content_hash', 'timestamp_dt'
            and 'content' keys. The list is sorted in place by time.

    Returns:
        A new list with duplicates removed, ordered by timestamp.
    """
    if not messages:
        return []

    # Sort by time first so each message is compared against the most
    # recently kept occurrence of the same content.
    messages.sort(key=lambda x: x.get('timestamp_dt') or datetime.min)

    deduped = []
    seen_hashes = {}  # hash -> (timestamp, content_preview)

    # Time window: identical content within 5 minutes counts as a repeat.
    time_window = timedelta(minutes=5)

    for msg in messages:
        content_hash = msg.get('content_hash')
        msg_time = msg.get('timestamp_dt')

        # Messages without a hash cannot be matched; keep them all.
        if content_hash is None:
            deduped.append(msg)
            continue

        is_duplicate = False
        if content_hash in seen_hashes:
            last_time, last_preview = seen_hashes[content_hash]
            if msg_time and last_time:
                if abs((msg_time - last_time).total_seconds()) < time_window.total_seconds():
                    is_duplicate = True
                    # Prefer the more detailed (longer) message: swap it
                    # in for the previously kept copy.
                    # NOTE(review): last_preview is truncated to 100
                    # chars, so this length comparison is approximate
                    # for contents longer than that — confirm intended.
                    if len(msg.get('content', '')) > len(last_preview):
                        for i, existing in enumerate(deduped):
                            if existing.get('content_hash') == content_hash:
                                deduped[i] = msg
                                seen_hashes[content_hash] = (msg_time, msg.get('content', '')[:100])
                                break

        if not is_duplicate:
            deduped.append(msg)
            # BUGFIX: explicit None check. The previous truthiness test
            # (`if content_hash:`) never recorded a hash of 0, so its
            # duplicates escaped detection.
            if content_hash is not None:
                seen_hashes[content_hash] = (msg_time, msg.get('content', '')[:100])

    return deduped
| | | |
| | | |
def aggregate_messages_across_sessions(session_files: List[Dict]) -> List[Dict]:
    """
    Aggregate messages across sessions into one time-ordered list.

    This is the key function for solving the session-split problem.
    Optimized:
    1. Correctly parses each message's real timestamp
    2. De-duplicates across sessions (handles repeats caused by resets)
    3. Rebuilds the full timeline

    Args:
        session_files: File-info dicts consumed by
            extract_messages_from_session (must carry 'name' for logging).

    Returns:
        All extracted messages, de-duplicated and sorted by timestamp.
    """
    all_messages = []

    print(f"\n 正在处理 {len(session_files)} 个 session 文件...")

    for file_info in session_files:
        messages = extract_messages_from_session(file_info)
        # BUGFIX: messages were previously extended into all_messages
        # twice (once unconditionally, then again inside the `if`),
        # double-counting every session's messages.
        all_messages.extend(messages)
        if messages:
            # Per-file stats on where each message's timestamp came from.
            time_sources = {}
            for m in messages:
                src = m.get('time_source', 'unknown')
                time_sources[src] = time_sources.get(src, 0) + 1
            print(f" 📄 {file_info['name'][:30]}...: {len(messages)} 条消息")
            for src, count in time_sources.items():
                print(f" └─ {src}: {count}")

    if not all_messages:
        return []

    # De-duplicate (handles repeats caused by session resets).
    print(f"\n 🔄 原始消息数: {len(all_messages)}")
    all_messages = deduplicate_messages(all_messages)
    print(f" ✅ 去重后消息数: {len(all_messages)}")

    # Sort by parsed timestamp to rebuild the full timeline.
    # (The previous extra sort by the ISO string was redundant — this
    # final stable sort alone determines the order.)
    all_messages.sort(key=lambda x: x.get('timestamp_dt') or datetime.min)

    return all_messages
| | | |