From 8303f25906f4c1ae631486af62224ce3579a83ac Mon Sep 17 00:00:00 2001
From: TevinClaw <510129976@qq.com>
Date: Mon, 16 Mar 2026 07:16:59 +0800
Subject: [PATCH] 优化三层记忆管理每日总结的 session 聚合能力
---
workspace/skills/memory-management/scripts/daily_check.py | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 162 insertions(+), 11 deletions(-)
diff --git a/workspace/skills/memory-management/scripts/daily_check.py b/workspace/skills/memory-management/scripts/daily_check.py
index 1e880cc..32443c0 100755
--- a/workspace/skills/memory-management/scripts/daily_check.py
+++ b/workspace/skills/memory-management/scripts/daily_check.py
@@ -128,10 +128,48 @@
return None
+def parse_timestamp(ts) -> Optional[datetime]:
+ """
+ 解析各种格式的时间戳为 datetime 对象
+ 支持 ISO 8601 字符串和毫秒级 Unix 时间戳
+ """
+ if not ts:
+ return None
+
+ # 如果是数字(毫秒级 Unix 时间戳)
+ if isinstance(ts, (int, float)):
+ # 毫秒转秒
+ ts_sec = ts / 1000 if ts > 1e10 else ts
+ try:
+ return datetime.fromtimestamp(ts_sec)
+ except (ValueError, OSError):
+ return None
+
+ # 如果是字符串(ISO 8601 格式)
+ if isinstance(ts, str):
+ try:
+ # 处理带 Z 的 UTC 时间
+ ts = ts.replace('Z', '+00:00')
+ # Python 3.7+ 支持 fromisoformat
+            dt = datetime.fromisoformat(ts)
+            # Convert aware datetimes to local time BEFORE dropping tzinfo,
+            # so UTC ("Z") timestamps compare correctly with naive local ones
+            if dt.tzinfo is not None:
+                dt = dt.astimezone().replace(tzinfo=None)
+ return dt
+ except (ValueError, TypeError):
+ return None
+
+ return None
+
+
def extract_messages_from_session(file_info: Dict) -> List[Dict]:
"""
从 session 文件中提取所有真实用户消息
- 增强版:过滤系统消息,提取实际用户内容
+ 优化版:
+ 1. 正确解析消息时间戳(而非使用文件修改时间)
+ 2. 提取飞书消息中的真实发送时间
+ 3. 改进内容去重和过滤
"""
messages = []
file_path = file_info['path']
@@ -172,13 +210,55 @@
# 提取真实用户内容(过滤系统消息)
user_content = extract_user_content(text)
- if user_content:
- messages.append({
- 'timestamp': record.get("timestamp", ""),
- 'content': user_content[:400], # 限制长度
- 'session': session_name,
- 'session_time': file_info['mtime'].strftime('%H:%M:%S')
- })
+ if not user_content:
+ break
+
+ # 解析时间戳 - 优先级:
+ # 1. record 级别的时间戳(ISO 8601)
+ # 2. message 内部的 timestamp(毫秒 Unix)
+ # 3. 从飞书消息文本中提取时间
+ # 4. 最后使用文件修改时间
+
+ msg_time = None
+ time_source = "unknown"
+
+ # 尝试从 record 获取时间戳
+ record_ts = record.get("timestamp")
+ if record_ts:
+ msg_time = parse_timestamp(record_ts)
+ if msg_time:
+ time_source = "record"
+
+ # 尝试从 message 内部获取时间戳(毫秒 Unix)
+ if not msg_time and "timestamp" in msg:
+ msg_time = parse_timestamp(msg.get("timestamp"))
+ if msg_time:
+ time_source = "message"
+
+ # 尝试从飞书消息文本中提取时间
+ if not msg_time:
+ feishu_time_match = re.search(r'\[(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})', text)
+ if feishu_time_match:
+ try:
+ msg_time = datetime.strptime(feishu_time_match.group(1), "%Y-%m-%d %H:%M:%S")
+ time_source = "feishu_text"
+ except ValueError:
+ pass
+
+ # 最后使用文件修改时间
+ if not msg_time:
+ msg_time = file_info['mtime']
+ time_source = "file_mtime"
+
+ messages.append({
+ 'timestamp': msg_time.isoformat() if msg_time else "",
+ 'timestamp_dt': msg_time,
+ 'content': user_content[:400], # 限制长度
+ 'session': session_name,
+ 'time_str': msg_time.strftime('%H:%M:%S') if msg_time else 'unknown',
+ 'time_source': time_source,
+ 'content_hash': hash(user_content[:100]) # 用于去重
+ })
break
except json.JSONDecodeError:
@@ -190,19 +270,90 @@
return messages
+def deduplicate_messages(messages: List[Dict]) -> List[Dict]:
+ """
+ 对跨 session 的消息进行去重
+ 基于内容哈希和时间窗口判断是否为重复消息
+ """
+ if not messages:
+ return []
+
+ # 先按时间排序
+ messages.sort(key=lambda x: x.get('timestamp_dt') or datetime.min)
+
+ deduped = []
+ seen_hashes = {} # hash -> (timestamp, content_preview)
+
+ # 时间窗口:5分钟内相同内容的视为重复
+ time_window = timedelta(minutes=5)
+
+ for msg in messages:
+ content_hash = msg.get('content_hash')
+ msg_time = msg.get('timestamp_dt')
+
+ if content_hash is None:
+ deduped.append(msg)
+ continue
+
+ # 检查是否已有相似消息
+ is_duplicate = False
+ if content_hash in seen_hashes:
+ last_time, last_preview = seen_hashes[content_hash]
+ if msg_time and last_time:
+ if abs((msg_time - last_time).total_seconds()) < time_window.total_seconds():
+ is_duplicate = True
+ # 保留更详细的消息(更长的内容)
+ if len(msg.get('content', '')) > len(last_preview):
+ # 替换之前的消息
+ for i, existing in enumerate(deduped):
+ if existing.get('content_hash') == content_hash:
+ deduped[i] = msg
+ seen_hashes[content_hash] = (msg_time, msg.get('content', '')[:100])
+ break
+
+ if not is_duplicate:
+ deduped.append(msg)
+ if content_hash:
+ seen_hashes[content_hash] = (msg_time, msg.get('content', '')[:100])
+
+ return deduped
+
+
def aggregate_messages_across_sessions(session_files: List[Dict]) -> List[Dict]:
"""
跨 session 聚合所有消息,按时间排序
- 这是解决 session 分割问题的关键函数
+ 优化版:
+ 1. 正确解析每条消息的真实时间戳
+ 2. 跨 session 去重(处理 session 重置导致的重复消息)
+ 3. 重建完整时间线
"""
all_messages = []
+ print(f"\n 正在处理 {len(session_files)} 个 session 文件...")
+
for file_info in session_files:
messages = extract_messages_from_session(file_info)
- all_messages.extend(messages)
+ if messages:
+ all_messages.extend(messages)
+ # 显示时间源统计
+ time_sources = {}
+ for m in messages:
+ src = m.get('time_source', 'unknown')
+ time_sources[src] = time_sources.get(src, 0) + 1
+ print(f" 📄 {file_info['name'][:30]}...: {len(messages)} 条消息")
+ for src, count in time_sources.items():
+ print(f" └─ {src}: {count}")
+
+ if not all_messages:
+ return []
+
+ # 去重(处理 session 重置导致的重复)
+ print(f"\n 🔄 原始消息数: {len(all_messages)}")
+ all_messages = deduplicate_messages(all_messages)
+ print(f" ✅ 去重后消息数: {len(all_messages)}")
# 按时间戳排序,重建完整时间线
- all_messages.sort(key=lambda x: x.get('timestamp', ''))
+ all_messages.sort(key=lambda x: x.get('timestamp_dt') or datetime.min)
return all_messages
--
Gitblit v1.9.1