From 3b60ae9ecfd15eb75d8ef4bc4a3c4b617e8eb1c2 Mon Sep 17 00:00:00 2001
From: "qichi.liang"
Date: Sun, 28 Dec 2025 23:31:22 +0800
Subject: [PATCH] refactor: modularize the project structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore           |  20 ++++
 AGENTS.md            |  67 ++++++++++++++
 fetch_and_process.py |  75 +++++++++++++++
 src/__init__.py      |  11 +++
 src/confluence.py    |  68 ++++++++++++++
 src/database.py      | 154 ++++++++++++++++++++++++++++++
 src/extractor.py     | 216 +++++++++++++++++++++++++++++++++++++++++++
 src/parser.py        | 144 +++++++++++++++++++++++++++++
 8 files changed, 755 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 AGENTS.md
 create mode 100644 fetch_and_process.py
 create mode 100644 src/__init__.py
 create mode 100644 src/confluence.py
 create mode 100644 src/database.py
 create mode 100644 src/extractor.py
 create mode 100644 src/parser.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6573c82
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+build/
+dist/
+
+# Database
+data/daily_logs.db
+
+# Cache
+*.pyc
+*.pyo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# IDE
+.vscode/
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..af6bcdd
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,67 @@
+# AGENTS.md
+
+This file provides guidance to agents when working with code in this repository.
+
+## Project Overview
+A Python tool that fetches HTML from the Confluence API and extracts layout-preserving text.
+
+## Project Structure
+
+```
+OrbitIn/
+├── src/                   # Source code modules
+│   ├── __init__.py        # Package initialization
+│   ├── confluence.py      # Confluence API client
+│   ├── extractor.py       # HTML text extractor
+│   ├── parser.py          # Log parser
+│   └── database.py        # SQLite3 database operations
+├── data/                  # Data directory
+│   └── daily_logs.db      # SQLite3 database file
+├── fetch_and_process.py   # CLI entry point
+├── AGENTS.md              # AI assistant documentation
+└── layout_output.txt      # Cached layout text
+```
+
+## Core Modules
+
+### [`ConfluenceClient`](src/confluence.py:9)
+- `fetch_content(content_id, expand)` - fetch page content
+- `get_html(content_id)` - fetch the page HTML string
+
+### [`HTMLTextExtractor`](src/extractor.py:12)
+- `extract(html)` - extract layout-preserving text from HTML
+- uses `html.parser` (not lxml)
+- removes Confluence macro elements that carry an `ac:name` attribute
+- tables are aligned per column with `ljust()`
+
+### [`HandoverLogParser`](src/parser.py:18)
+- `parse(text)` - parse log text into a list of `ShipLog`
+- `ShipLog` dataclass: date, shift, ship_name, teu, efficiency, vehicles
+
+### [`DailyLogsDatabase`](src/database.py:13)
+- `insert(log)` - insert a single record
+- `insert_many(logs)` - bulk insert
+- `query_by_date(date)` - query by date
+- `query_by_ship(ship_name)` - query by ship name
+- `query_all(limit)` - query all records
+- `get_stats()` - summary statistics
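+
+### Pipeline example
+
+A minimal end-to-end sketch of how the four modules compose (assumes a valid
+token in the `CONFLUENCE_TOKEN` environment variable; the page ID is the one
+used by `fetch_and_process.py`):
+
+```python
+import os
+from src import ConfluenceClient, HTMLTextExtractor, HandoverLogParser, DailyLogsDatabase
+
+client = ConfluenceClient('https://confluence.westwell-lab.com/rest/api',
+                          os.getenv('CONFLUENCE_TOKEN', ''))
+html = client.get_html('155764524')        # storage-format HTML
+text = HTMLTextExtractor().extract(html)   # layout-preserving text
+logs = HandoverLogParser().parse(text)     # list of ShipLog
+
+db = DailyLogsDatabase()
+db.insert_many([log.to_dict() for log in logs])
+db.close()
+```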
+
+## Text Format Conventions
+
+- List prefixes: `•` for `ul`, number + dot for `ol`
+- Bold is `**text**`, italic is `*text*`
+- Horizontal rules use the `─` (U+2500) character
+- Links render as `text (url)`
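+
+An illustrative fragment of extractor output under these conventions (an
+invented sample, not content from a real page):
+
+```
+# Shift Handover
+• first item
+• second item
+**bold**, *italic*, a link (https://example.com)
+──────────────────────────────────────────────────
+```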
+
+## Commands
+
+```bash
+# Run with database storage (default)
+python3 fetch_and_process.py
+
+# Run without storing to the database
+python3 fetch_and_process.py --no-db
+
+# Smoke-test the parser module
+python3 -c "from src.parser import HandoverLogParser; p = HandoverLogParser(); print(p.parse(open('layout_output.txt').read())[:3])"
+```
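+
+To inspect stored records directly, the stock `sqlite3` CLI works against the
+default database path (a read-only convenience, not part of the tool itself):
+
+```bash
+sqlite3 data/daily_logs.db \
+  "SELECT date, shift, ship_name, teu, efficiency FROM daily_handover_logs ORDER BY date DESC LIMIT 5"
+```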
diff --git a/fetch_and_process.py b/fetch_and_process.py
new file mode 100644
index 0000000..49fd979
--- /dev/null
+++ b/fetch_and_process.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""
+Fetch handover logs from Confluence and save them to the database
+"""
+import argparse
+import os
+import sys
+
+from src.confluence import ConfluenceClient
+from src.extractor import HTMLTextExtractor
+from src.parser import HandoverLogParser
+from src.database import DailyLogsDatabase
+
+
+def run(save_db: bool = True):
+    """Run the main pipeline"""
+    # Configuration; the token is read from the environment rather than
+    # committed to the repository
+    CONTENT_ID = '155764524'
+    BASE_URL = 'https://confluence.westwell-lab.com/rest/api'
+    TOKEN = os.getenv('CONFLUENCE_TOKEN', '')
+
+    if not TOKEN:
+        print('Error: the CONFLUENCE_TOKEN environment variable is not set')
+        sys.exit(1)
+
+    print('Fetching HTML content from Confluence...')
+
+    # Fetch the HTML
+    client = ConfluenceClient(BASE_URL, TOKEN)
+    html = client.get_html(CONTENT_ID)
+
+    if not html:
+        print('Error: no HTML content received')
+        sys.exit(1)
+
+    print('Extracting layout text...')
+
+    # Extract the text
+    extractor = HTMLTextExtractor()
+    layout_text = extractor.extract(html)
+
+    print(f'\nExtraction complete: {len(layout_text)} characters\n')
+
+    # Save to a file (optional)
+    with open('layout_output.txt', 'w', encoding='utf-8') as f:
+        f.write(layout_text)
+    print('Layout text saved to layout_output.txt')
+
+    # Save to the database (optional)
+    if save_db:
+        print('\nParsing log data...')
+
+        parser = HandoverLogParser()
+        logs = parser.parse(layout_text)
+
+        if not logs:
+            print('No records parsed')
+            return
+
+        print(f'Parsed {len(logs)} records')
+
+        db = DailyLogsDatabase()
+        count = db.insert_many([log.to_dict() for log in logs])
+        print(f'Saved {count} records to the database')
+
+        stats = db.get_stats()
+        print('\nDatabase statistics:')
+        print(f'  total records: {stats["total"]}')
+        print(f'  ships: {len(stats["ships"])}')
+        print(f'  date range: {stats["date_range"]["start"]} ~ {stats["date_range"]["end"]}')
+
+        db.close()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Fetch handover logs from Confluence')
+    parser.add_argument('--no-db', action='store_true', help='do not save to the database')
+    args = parser.parse_args()
+
+    run(save_db=not args.no_db)
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..eeda25f
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+"""
+OrbitIn - Confluence log fetching and processing toolkit
+"""
+from .confluence import ConfluenceClient
+from .extractor import HTMLTextExtractor
+from .parser import HandoverLogParser
+from .database import DailyLogsDatabase
+
+__version__ = '1.0.0'
+__all__ = ['ConfluenceClient', 'HTMLTextExtractor', 'HandoverLogParser', 'DailyLogsDatabase']
diff --git a/src/confluence.py b/src/confluence.py
new file mode 100644
index 0000000..37fbaa6
--- /dev/null
+++ b/src/confluence.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""
+Confluence API client module
+"""
+import requests
+
+
+class ConfluenceClient:
+    """Confluence REST API client"""
+
+    def __init__(self, base_url: str, token: str):
+        """
+        Initialize the client
+
+        Args:
+            base_url: Confluence API base URL (without /content)
+            token: Bearer authentication token
+        """
+        self.base_url = base_url.rstrip('/')
+        self.headers = {
+            'Authorization': f'Bearer {token}',
+            'Accept': 'application/json'
+        }
+
+    def fetch_content(self, content_id: str, expand: str = 'body.storage') -> dict:
+        """
+        Fetch page content
+
+        Args:
+            content_id: page ID
+            expand: fields to expand
+
+        Returns:
+            API response data
+        """
+        url = f'{self.base_url}/content/{content_id}'
+        params = {'expand': expand}
+
+        response = requests.get(url, headers=self.headers, params=params, timeout=30)
+        response.raise_for_status()
+        return response.json()
+
+    def get_html(self, content_id: str) -> str:
+        """
+        Fetch the page HTML content
+
+        Args:
+            content_id: page ID
+
+        Returns:
+            HTML string
+        """
+        data = self.fetch_content(content_id)
+        return data.get('body', {}).get('storage', {}).get('value', '')
+
+
+if __name__ == '__main__':
+    # Usage example
+    import os
+
+    client = ConfluenceClient(
+        base_url='https://confluence.westwell-lab.com/rest/api',
+        token=os.getenv('CONFLUENCE_TOKEN', '')
+    )
+
+    html = client.get_html('155764524')
+    print(f'Fetched {len(html)} characters of HTML content')
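+
+    # Failure-handling sketch: fetch_content() calls raise_for_status(), so
+    # HTTP errors surface as requests.HTTPError and connection problems as
+    # its parent class requests.RequestException. Callers can guard the
+    # fetch like this ('version' is a standard Confluence expand field):
+    try:
+        data = client.fetch_content('155764524', expand='version')
+        print(f"Page version: {data.get('version', {}).get('number')}")
+    except requests.RequestException as exc:
+        print(f'Fetch failed: {exc}')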
diff --git a/src/database.py b/src/database.py
new file mode 100644
index 0000000..f7895fa
--- /dev/null
+++ b/src/database.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+Database module
+"""
+import sqlite3
+import os
+from typing import List, Dict
+
+
+class DailyLogsDatabase:
+    """Daily handover log database"""
+
+    def __init__(self, db_path: str = 'data/daily_logs.db'):
+        """
+        Initialize the database
+
+        Args:
+            db_path: database file path
+        """
+        self.db_path = db_path
+        self._ensure_directory()
+        self.conn = self._connect()
+        self._init_schema()
+
+    def _ensure_directory(self):
+        """Ensure the data directory exists"""
+        data_dir = os.path.dirname(self.db_path)
+        if data_dir and not os.path.exists(data_dir):
+            os.makedirs(data_dir)
+
+    def _connect(self) -> sqlite3.Connection:
+        """Connect to the database"""
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def _init_schema(self):
+        """Initialize the table schema"""
+        cursor = self.conn.cursor()
+
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS daily_handover_logs (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                date TEXT NOT NULL,
+                shift TEXT NOT NULL,
+                ship_name TEXT NOT NULL,
+                teu INTEGER,
+                efficiency REAL,
+                vehicles INTEGER,
+                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                UNIQUE(date, shift, ship_name)
+            )
+        ''')
+
+        # Indexes
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_date ON daily_handover_logs(date)')
+        cursor.execute('CREATE INDEX IF NOT EXISTS idx_ship ON daily_handover_logs(ship_name)')
+
+        self.conn.commit()
+
+    def insert(self, log: Dict) -> bool:
+        """Insert a single record"""
+        try:
+            cursor = self.conn.cursor()
+            cursor.execute('''
+                INSERT OR REPLACE INTO daily_handover_logs
+                (date, shift, ship_name, teu, efficiency, vehicles)
+                VALUES (?, ?, ?, ?, ?, ?)
+            ''', (
+                log['date'], log['shift'], log['ship_name'],
+                log.get('teu'), log.get('efficiency'), log.get('vehicles')
+            ))
+            self.conn.commit()
+            return True
+        except sqlite3.Error:
+            return False
+
+    def insert_many(self, logs: List[Dict]) -> int:
+        """Bulk insert; returns the number of records inserted"""
+        count = 0
+        for log in logs:
+            if self.insert(log):
+                count += 1
+        return count
+
+    def query_by_date(self, date: str) -> List[Dict]:
+        """Query by date"""
+        cursor = self.conn.cursor()
+        cursor.execute('''
+            SELECT * FROM daily_handover_logs
+            WHERE date = ? ORDER BY shift, ship_name
+        ''', (date,))
+        return [dict(row) for row in cursor.fetchall()]
+
+    def query_by_ship(self, ship_name: str) -> List[Dict]:
+        """Query by ship name"""
+        cursor = self.conn.cursor()
+        cursor.execute('''
+            SELECT * FROM daily_handover_logs
+            WHERE ship_name LIKE ? ORDER BY date DESC
+        ''', (f'%{ship_name}%',))
+        return [dict(row) for row in cursor.fetchall()]
+
+    def query_all(self, limit: int = 1000) -> List[Dict]:
+        """Query all records"""
+        cursor = self.conn.cursor()
+        cursor.execute('''
+            SELECT * FROM daily_handover_logs
+            ORDER BY date DESC, shift LIMIT ?
+        ''', (limit,))
+        return [dict(row) for row in cursor.fetchall()]
+
+    def get_stats(self) -> Dict:
+        """Fetch summary statistics"""
+        cursor = self.conn.cursor()
+
+        cursor.execute('SELECT COUNT(*) FROM daily_handover_logs')
+        total = cursor.fetchone()[0]
+
+        cursor.execute('SELECT DISTINCT ship_name FROM daily_handover_logs')
+        ships = [row[0] for row in cursor.fetchall()]
+
+        cursor.execute('SELECT MIN(date), MAX(date) FROM daily_handover_logs')
+        date_range = cursor.fetchone()
+
+        return {
+            'total': total,
+            'ships': ships,
+            'date_range': {'start': date_range[0], 'end': date_range[1]}
+        }
+
+    def close(self):
+        """Close the connection"""
+        if self.conn:
+            self.conn.close()
+
+
+if __name__ == '__main__':
+    db = DailyLogsDatabase()
+
+    # Insert test
+    test_log = {
+        'date': '2025-12-28',
+        'shift': '白班',
+        'ship_name': '测试船',
+        'teu': 100,
+        'efficiency': 3.5,
+        'vehicles': 5
+    }
+
+    db.insert(test_log)
+    print(f'Total records: {db.get_stats()["total"]}')
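+
+    # Query sketch: exercise the read paths against the record just inserted
+    # ('白班' is the day shift; '测试船' means "test ship").
+    for row in db.query_by_date('2025-12-28'):
+        print(f"by date: {row['shift']} {row['ship_name']} {row['teu']}TEU")
+    for row in db.query_by_ship('测试船'):
+        print(f"by ship: {row['date']} efficiency={row['efficiency']}")
+    print(f'recent rows: {len(db.query_all(limit=10))}')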

+    db.close()
diff --git a/src/extractor.py b/src/extractor.py
new file mode 100644
index 0000000..042b846
--- /dev/null
+++ b/src/extractor.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+HTML text extraction module
+"""
+import re
+from bs4 import BeautifulSoup, Tag, NavigableString
+from typing import List
+
+
+class HTMLTextExtractor:
+    """HTML text extractor - preserves layout structure"""
+
+    # Block-level elements
+    BLOCK_TAGS = {
+        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'section',
+        'table', 'tr', 'td', 'th', 'li', 'ul', 'ol', 'blockquote',
+        'pre', 'hr', 'br', 'tbody', 'thead', 'tfoot'
+    }
+
+    def __init__(self):
+        """Initialize the extractor"""
+        self.output_lines: List[str] = []
+
+    def extract(self, html: str) -> str:
+        """
+        Extract layout-preserving text from HTML
+
+        Args:
+            html: HTML string
+
+        Returns:
+            formatted plain text
+        """
+        if not html:
+            return ''
+
+        soup = BeautifulSoup(html, 'html.parser')
+
+        # Remove unwanted elements
+        for tag in soup(["script", "style", "noscript"]):
+            tag.decompose()
+
+        # Remove Confluence macros
+        for macro in soup.find_all(attrs={"ac:name": True}):
+            macro.decompose()
+
+        self.output_lines = []
+
+        # Process the body, or the whole document if there is no body
+        body = soup.body if soup.body else soup
+        for child in body.children:
+            self._process_node(child)
+
+        # Clean up the result
+        result = ''.join(self.output_lines)
+        result = re.sub(r'\n\s*\n\s*\n', '\n\n', result)
+        result = '\n'.join(line.rstrip() for line in result.split('\n'))
+        return result.strip()
+
+    def _process_node(self, node, indent: int = 0, list_context=None):
+        """Process a node recursively"""
+        if isinstance(node, NavigableString):
+            text = str(node).strip()
+            if text:
+                text = re.sub(r'\s+', ' ', text)
+                if self.output_lines and not self.output_lines[-1].endswith('\n'):
+                    self.output_lines[-1] += text
+                else:
+                    self.output_lines.append(' ' * indent + text)
+            return
+
+        if not isinstance(node, Tag):
+            return
+
+        tag_name = node.name.lower()
+        is_block = tag_name in self.BLOCK_TAGS
+
+        # Start a new line before block-level elements
+        if is_block and self.output_lines and not self.output_lines[-1].endswith('\n'):
+            self.output_lines.append('\n')
+
+        # Handle specific tags
+        if tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
+            level = int(tag_name[1])
+            prefix = '#' * level + ' '
+            text = node.get_text().strip()
+            if text:
+                self.output_lines.append(' ' * indent + prefix + text + '\n')
+            return
+
+        elif tag_name == 'p':
+            text = node.get_text().strip()
+            if text:
+                self.output_lines.append(' ' * indent + text + '\n')
+            return
+
+        elif tag_name == 'hr':
+            self.output_lines.append(' ' * indent + '─' * 50 + '\n')
+            return
+
+        elif tag_name == 'br':
+            self.output_lines.append('\n')
+            return
+
+        elif tag_name == 'table':
+            self._process_table(node, indent)
+            return
+
+        elif tag_name in ('ul', 'ol'):
+            self._process_list(node, indent, tag_name)
+            return
+
+        elif tag_name == 'li':
+            self._process_list_item(node, indent, list_context)
+            return
+
+        elif tag_name == 'a':
+            href = node.get('href', '')
+            text = node.get_text().strip()
+            if href and text:
+                self.output_lines.append(f'{text} ({href})')
+            elif text:
+                self.output_lines.append(text)
+            return
+
+        elif tag_name in ('strong', 'b'):
+            text = node.get_text().strip()
+            if text:
+                self.output_lines.append(f'**{text}**')
+            return
+
+        elif tag_name in ('em', 'i'):
+            text = node.get_text().strip()
+            if text:
+                self.output_lines.append(f'*{text}*')
+            return
+
+        else:
+            # Default: recurse into children
+            for child in node.children:
+                self._process_node(child, indent, list_context)
+
+            if is_block and self.output_lines and not self.output_lines[-1].endswith('\n'):
+                self.output_lines.append('\n')
+
+    def _process_table(self, table: Tag, indent: int):
+        """Format a table"""
+        rows = []
+        for tr in table.find_all('tr'):
+            row = []
+            for td in tr.find_all(['td', 'th']):
+                row.append(td.get_text().strip())
+            if row:
+                rows.append(row)
+
+        if rows:
+            # Compute column widths
+            col_widths = []
+            for i in range(max(len(r) for r in rows)):
+                col_width = max((len(r[i]) if i < len(r) else 0) for r in rows)
+                col_widths.append(col_width)
+
+            for row in rows:
+                line = ' ' * indent
+                for i, cell in enumerate(row):
+                    width = col_widths[i] if i < len(col_widths) else 0
+                    line += cell.ljust(width) + ' '
+                self.output_lines.append(line.rstrip() + '\n')
+            self.output_lines.append('\n')
+
+    def _process_list(self, ul: Tag, indent: int, list_type: str):
+        """Format a list"""
+        counter = 1 if list_type == 'ol' else None
+        for child in ul.children:
+            if isinstance(child, Tag) and child.name == 'li':
+                ctx = (list_type, counter) if counter else (list_type, 1)
+                self._process_list_item(child, indent, ctx)
+                if counter:
+                    counter += 1
+            else:
+                self._process_node(child, indent, (list_type, 1) if not counter else None)
+
+    def _process_list_item(self, li: Tag, indent: int, list_context):
+        """Format a list item"""
+        prefix = ''
+        if list_context:
+            list_type, num = list_context
+            prefix = '• ' if list_type == 'ul' else f'{num}. '
+
+        # Collect the item's direct text
+        direct_parts = []
+        for child in li.children:
+            if isinstance(child, NavigableString):
+                text = str(child).strip()
+                if text:
+                    direct_parts.append(text)
+            elif isinstance(child, Tag) and child.name == 'a':
+                href = child.get('href', '')
+                link_text = child.get_text().strip()
+                if href and link_text:
+                    direct_parts.append(f'{link_text} ({href})')
+
+        if direct_parts:
+            self.output_lines.append(' ' * indent + prefix + ' '.join(direct_parts) + '\n')
+
+        # Recurse into child elements
+        for child in li.children:
+            if isinstance(child, Tag) and child.name != 'a':
+                self._process_node(child, indent + 2, None)
+
+
+if __name__ == '__main__':
+    # Test: a heading plus a paragraph exercises the block-level handlers
+    html = "<h1>标题</h1><p>段落</p>"
+    extractor = HTMLTextExtractor()
+    print(extractor.extract(html))
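+
+    # Table sketch: _process_table() pads each column with ljust(); this
+    # two-row table shows the resulting alignment.
+    table_html = ("<table><tr><th>船名</th><th>TEU</th></tr>"
+                  "<tr><td>测试船</td><td>100</td></tr></table>")
+    print(extractor.extract(table_html))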

diff --git a/src/parser.py b/src/parser.py
new file mode 100644
index 0000000..da12737
--- /dev/null
+++ b/src/parser.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Log parsing module
+"""
+import re
+from typing import List, Dict, Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class ShipLog:
+    """Per-ship log record"""
+    date: str
+    shift: str
+    ship_name: str
+    teu: Optional[int] = None
+    efficiency: Optional[float] = None
+    vehicles: Optional[int] = None
+
+    def to_dict(self) -> Dict:
+        """Convert to a dictionary"""
+        return {
+            'date': self.date,
+            'shift': self.shift,
+            'ship_name': self.ship_name,
+            'teu': self.teu,
+            'efficiency': self.efficiency,
+            'vehicles': self.vehicles
+        }
+
+
+class HandoverLogParser:
+    """Handover log parser"""
+
+    SEPARATOR = '———————————————————————————————————————————————'
+
+    def __init__(self):
+        """Initialize the parser"""
+        pass
+
+    @staticmethod
+    def parse_date(date_str: str) -> str:
+        """Normalize a YYYY.MM.DD date string to YYYY-MM-DD"""
+        try:
+            parts = date_str.split('.')
+            if len(parts) == 3:
+                return f"{parts[0]}-{parts[1]}-{parts[2]}"
+            return date_str
+        except Exception:
+            return date_str
+
+    def parse(self, text: str) -> List[ShipLog]:
+        """
+        Parse log text
+
+        Args:
+            text: log text
+
+        Returns:
+            list of per-ship logs
+        """
+        logs = []
+        blocks = text.split(self.SEPARATOR)
+
+        for block in blocks:
+            if not block.strip() or '日期:' not in block:
+                continue
+
+            # Parse the date ('日期' = date)
+            date_match = re.search(r'日期:(\d{4}\.\d{2}\.\d{2})', block)
+            if not date_match:
+                continue
+
+            date = self.parse_date(date_match.group(1))
+            self._parse_block(block, date, logs)
+
+        return logs
+
+    def _parse_block(self, block: str, date: str, logs: List[ShipLog]):
+        """Parse one dated block ('白班' = day shift, '夜班' = night shift)"""
+        for shift in ['白班', '夜班']:
+            shift_pattern = f'{shift}:'
+            if shift_pattern not in block:
+                continue
+
+            shift_start = block.find(shift_pattern) + len(shift_pattern)
+
+            # Find the next shift marker or the notes section ('注意事项' = notes)
+            next_pos = len(block)
+            for next_shift in ['白班', '夜班']:
+                if next_shift != shift:
+                    pos = block.find(f'{next_shift}:', shift_start)
+                    if pos != -1 and pos < next_pos:
+                        next_pos = pos
+
+            notes_pos = block.find('注意事项:', shift_start)
+            if notes_pos != -1 and notes_pos < next_pos:
+                next_pos = notes_pos
+
+            shift_content = block[shift_start:next_pos]
+            self._parse_ships(shift_content, date, shift, logs)
+
+    def _parse_ships(self, content: str, date: str, shift: str, logs: List[ShipLog]):
+        """Parse ship entries ('实船作业' = actual ship operation)"""
+        parts = content.split('实船作业:')
+
+        for part in parts:
+            if not part.strip():
+                continue
+
+            cleaned = part.replace('\xa0', ' ').strip()
+            ship_match = re.search(r'#\s+(\S+)', cleaned)
+
+            if not ship_match:
+                continue
+
+            ship_name = ship_match.group(1)
+            # '上场车辆数' = number of trucks deployed
+            vehicles_match = re.search(r'上场车辆数:(\d+)', cleaned)
+            # '作业量/效率' = volume/efficiency, in TEU and cycles/truck/hour
+            teu_eff_match = re.search(
+                r'作业量/效率:(\d+)TEU[,,\s]+([\d.]+)循环/车/小时', cleaned
+            )
+
+            log = ShipLog(
+                date=date,
+                shift=shift,
+                ship_name=ship_name,
+                teu=int(teu_eff_match.group(1)) if teu_eff_match else None,
+                efficiency=float(teu_eff_match.group(2)) if teu_eff_match else None,
+                vehicles=int(vehicles_match.group(1)) if vehicles_match else None
+            )
+            logs.append(log)
+
+
+if __name__ == '__main__':
+    # Test
+    with open('layout_output.txt', 'r', encoding='utf-8') as f:
+        text = f.read()
+
+    parser = HandoverLogParser()
+    logs = parser.parse(text)
+
+    print(f'Parsed {len(logs)} records')
+    for log in logs[:5]:
+        print(f'{log.date} {log.shift} {log.ship_name}: {log.teu}TEU')