mirror of
https://devops.liangqichi.top/qichi.liang/Orbitin.git
synced 2026-02-10 15:41:31 +08:00
refactor: 模块化重构项目结构
This commit is contained in:
11
src/__init__.py
Normal file
11
src/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
OrbitIn - Confluence 日志抓取与处理工具包
|
||||
"""
|
||||
from .confluence import ConfluenceClient
|
||||
from .extractor import HTMLTextExtractor
|
||||
from .parser import HandoverLogParser
|
||||
from .database import DailyLogsDatabase
|
||||
|
||||
# Package version (bump on release).
__version__ = '1.0.0'

# Public API re-exported from the submodules above.
__all__ = ['ConfluenceClient', 'HTMLTextExtractor', 'HandoverLogParser', 'DailyLogsDatabase']
|
||||
68
src/confluence.py
Normal file
68
src/confluence.py
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Confluence API 客户端模块
|
||||
"""
|
||||
import requests
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class ConfluenceClient:
    """Thin Confluence REST API client using Bearer-token auth."""

    def __init__(self, base_url: str, token: str):
        """Set up the client.

        Args:
            base_url: Confluence API base URL (without /content);
                a trailing slash is stripped.
            token: Bearer authentication token.
        """
        self.base_url = base_url.rstrip('/')
        self.headers = {
            'Authorization': f'Bearer {token}',
            'Accept': 'application/json'
        }

    def fetch_content(self, content_id: str, expand: str = 'body.storage') -> dict:
        """Fetch a content resource from the API.

        Args:
            content_id: Page ID.
            expand: Comma-separated fields to expand in the response.

        Returns:
            Parsed JSON response.

        Raises:
            requests.HTTPError: on a non-2xx status code.
        """
        resource_url = f'{self.base_url}/content/{content_id}'
        response = requests.get(
            resource_url,
            headers=self.headers,
            params={'expand': expand},
            timeout=30,
        )
        response.raise_for_status()
        return response.json()

    def get_html(self, content_id: str) -> str:
        """Return the page's storage-format HTML ('' when absent).

        Args:
            content_id: Page ID.
        """
        payload = self.fetch_content(content_id)
        storage = payload.get('body', {}).get('storage', {})
        return storage.get('value', '')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Usage example: fetch one page and report its size.
    import os

    token = os.getenv('CONFLUENCE_TOKEN', '')
    client = ConfluenceClient(
        base_url='https://confluence.westwell-lab.com/rest/api',
        token=token,
    )
    html = client.get_html('155764524')
    print(f'获取到 {len(html)} 字符的HTML内容')
|
||||
154
src/database.py
Normal file
154
src/database.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
数据库模块
|
||||
"""
|
||||
import sqlite3
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
|
||||
class DailyLogsDatabase:
    """SQLite-backed store for daily shift-handover logs.

    Also usable as a context manager, closing the connection on exit:

        with DailyLogsDatabase('data/daily_logs.db') as db:
            db.insert(log)
    """

    def __init__(self, db_path: str = 'data/daily_logs.db'):
        """Open (creating if necessary) the database.

        Args:
            db_path: Database file path; the parent directory is
                created when it does not exist.
        """
        self.db_path = db_path
        self._ensure_directory()
        self.conn = self._connect()
        self._init_schema()

    def __enter__(self) -> 'DailyLogsDatabase':
        """Enter a `with` block; returns the open database."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Leave a `with` block; always closes the connection."""
        self.close()

    def _ensure_directory(self) -> None:
        """Create the data directory if it is missing."""
        data_dir = os.path.dirname(self.db_path)
        if data_dir:
            # exist_ok=True avoids the TOCTOU race of a separate
            # os.path.exists() check when several processes start at once.
            os.makedirs(data_dir, exist_ok=True)

    def _connect(self) -> sqlite3.Connection:
        """Open the SQLite connection with mapping-style row access."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row  # rows convert cleanly via dict(row)
        return conn

    def _init_schema(self) -> None:
        """Create the table and its indexes if they do not exist yet."""
        cursor = self.conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS daily_handover_logs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT NOT NULL,
                shift TEXT NOT NULL,
                ship_name TEXT NOT NULL,
                teu INTEGER,
                efficiency REAL,
                vehicles INTEGER,
                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(date, shift, ship_name)
            )
        ''')

        # Indexes for the two query paths (by date, by ship name).
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_date ON daily_handover_logs(date)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_ship ON daily_handover_logs(ship_name)')

        self.conn.commit()

    def insert(self, log: Dict) -> bool:
        """Insert (or replace, per the UNIQUE constraint) one record.

        Args:
            log: Mapping with required keys 'date', 'shift', 'ship_name'
                and optional 'teu', 'efficiency', 'vehicles'.

        Returns:
            True on success, False on any SQLite error (best-effort API;
            a missing required key still raises KeyError).
        """
        try:
            cursor = self.conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO daily_handover_logs
                (date, shift, ship_name, teu, efficiency, vehicles)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                log['date'], log['shift'], log['ship_name'],
                log.get('teu'), log.get('efficiency'), log.get('vehicles')
            ))
            self.conn.commit()
            return True
        except sqlite3.Error:
            # Deliberate best-effort behavior: callers count successes
            # via the boolean result instead of handling exceptions.
            return False

    def insert_many(self, logs: List[Dict]) -> int:
        """Insert many records; return the number successfully stored."""
        return sum(1 for log in logs if self.insert(log))

    def query_by_date(self, date: str) -> List[Dict]:
        """Return all records for one date, ordered by shift then ship."""
        cursor = self.conn.cursor()
        cursor.execute('''
            SELECT * FROM daily_handover_logs
            WHERE date = ? ORDER BY shift, ship_name
        ''', (date,))
        return [dict(row) for row in cursor.fetchall()]

    def query_by_ship(self, ship_name: str) -> List[Dict]:
        """Return records whose ship name contains *ship_name*, newest first."""
        cursor = self.conn.cursor()
        cursor.execute('''
            SELECT * FROM daily_handover_logs
            WHERE ship_name LIKE ? ORDER BY date DESC
        ''', (f'%{ship_name}%',))
        return [dict(row) for row in cursor.fetchall()]

    def query_all(self, limit: int = 1000) -> List[Dict]:
        """Return up to *limit* records, newest date first."""
        cursor = self.conn.cursor()
        cursor.execute('''
            SELECT * FROM daily_handover_logs
            ORDER BY date DESC, shift LIMIT ?
        ''', (limit,))
        return [dict(row) for row in cursor.fetchall()]

    def get_stats(self) -> Dict:
        """Return summary stats: total rows, distinct ships, date range.

        date_range start/end are None when the table is empty.
        """
        cursor = self.conn.cursor()

        cursor.execute('SELECT COUNT(*) FROM daily_handover_logs')
        total = cursor.fetchone()[0]

        cursor.execute('SELECT DISTINCT ship_name FROM daily_handover_logs')
        ships = [row[0] for row in cursor.fetchall()]

        cursor.execute('SELECT MIN(date), MAX(date) FROM daily_handover_logs')
        date_range = cursor.fetchone()

        return {
            'total': total,
            'ships': ships,
            'date_range': {'start': date_range[0], 'end': date_range[1]}
        }

    def close(self) -> None:
        """Close the underlying connection."""
        if self.conn:
            self.conn.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    db = DailyLogsDatabase()

    # Smoke test: insert one record, then report the total count.
    sample = {
        'date': '2025-12-28',
        'shift': '白班',
        'ship_name': '测试船',
        'teu': 100,
        'efficiency': 3.5,
        'vehicles': 5,
    }
    db.insert(sample)
    print(f'总记录: {db.get_stats()["total"]}')
    db.close()
|
||||
216
src/extractor.py
Normal file
216
src/extractor.py
Normal file
@@ -0,0 +1,216 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HTML 文本提取模块
|
||||
"""
|
||||
import re
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
from typing import List
|
||||
|
||||
|
||||
class HTMLTextExtractor:
    """HTML text extractor that preserves the page's layout structure.

    Walks the parsed tree and renders headings, paragraphs, tables and
    lists into indented plain text, accumulating into self.output_lines.
    """

    # Tags treated as block-level: a newline is emitted before them.
    BLOCK_TAGS = {
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'section',
        'table', 'tr', 'td', 'th', 'li', 'ul', 'ol', 'blockquote',
        'pre', 'hr', 'br', 'tbody', 'thead', 'tfoot'
    }

    def __init__(self):
        """Initialize with an empty output buffer."""
        self.output_lines: List[str] = []

    def extract(self, html: str) -> str:
        """Extract layout-preserving text from HTML.

        Args:
            html: HTML string (Confluence storage format or plain HTML).

        Returns:
            Formatted plain text; '' for empty input.
        """
        if not html:
            return ''

        soup = BeautifulSoup(html, 'html.parser')

        # Drop non-content elements entirely.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        # Drop Confluence macros (elements carrying an ac:name attribute).
        for macro in soup.find_all(attrs={"ac:name": True}):
            macro.decompose()

        self.output_lines = []

        # Walk <body> when present, otherwise the whole document.
        body = soup.body if soup.body else soup
        for child in body.children:
            self._process_node(child)

        # Collapse runs of 3+ newlines to a blank line and strip
        # trailing whitespace from every line.
        result = ''.join(self.output_lines)
        result = re.sub(r'\n\s*\n\s*\n', '\n\n', result)
        result = '\n'.join(line.rstrip() for line in result.split('\n'))
        return result.strip()

    def _process_node(self, node, indent: int = 0, list_context=None):
        """Recursively render one node into the output buffer.

        Args:
            node: A bs4 Tag or NavigableString.
            indent: Leading-space count for new lines.
            list_context: (list_type, number) tuple when inside an <li>.
        """
        if isinstance(node, NavigableString):
            text = str(node).strip()
            if text:
                text = re.sub(r'\s+', ' ', text)
                # Continue the current line unless it was terminated;
                # otherwise start a fresh indented line.
                if self.output_lines and not self.output_lines[-1].endswith('\n'):
                    self.output_lines[-1] += text
                else:
                    self.output_lines.append(' ' * indent + text)
            return

        # Anything that is neither text nor a Tag (e.g. comments) is skipped.
        if not isinstance(node, Tag):
            return

        tag_name = node.name.lower()
        is_block = tag_name in self.BLOCK_TAGS

        # Block-level elements start on a new line.
        if is_block and self.output_lines and not self.output_lines[-1].endswith('\n'):
            self.output_lines.append('\n')

        # Headings render as Markdown-style '#' prefixes.
        if tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            level = int(tag_name[1])
            prefix = '#' * level + ' '
            text = node.get_text().strip()
            if text:
                self.output_lines.append(' ' * indent + prefix + text + '\n')
            return

        elif tag_name == 'p':
            text = node.get_text().strip()
            if text:
                self.output_lines.append(' ' * indent + text + '\n')
            return

        elif tag_name == 'hr':
            # Horizontal rule becomes a fixed-width dash line.
            self.output_lines.append(' ' * indent + '─' * 50 + '\n')
            return

        elif tag_name == 'br':
            self.output_lines.append('\n')
            return

        elif tag_name == 'table':
            self._process_table(node, indent)
            return

        elif tag_name in ('ul', 'ol'):
            self._process_list(node, indent, tag_name)
            return

        elif tag_name == 'li':
            self._process_list_item(node, indent, list_context)
            return

        elif tag_name == 'a':
            # Links render as 'text (href)'.
            href = node.get('href', '')
            text = node.get_text().strip()
            if href and text:
                self.output_lines.append(f'{text} ({href})')
            elif text:
                self.output_lines.append(text)
            return

        elif tag_name in ('strong', 'b'):
            text = node.get_text().strip()
            if text:
                self.output_lines.append(f'**{text}**')
            return

        elif tag_name in ('em', 'i'):
            text = node.get_text().strip()
            if text:
                self.output_lines.append(f'*{text}*')
            return

        else:
            # Unrecognized tag: recurse into its children unchanged.
            for child in node.children:
                self._process_node(child, indent, list_context)

            # Terminate the line after an unrecognized block element.
            if is_block and self.output_lines and not self.output_lines[-1].endswith('\n'):
                self.output_lines.append('\n')

    def _process_table(self, table: Tag, indent: int):
        """Render a table as space-aligned text columns."""
        rows = []
        for tr in table.find_all('tr'):
            row = []
            for td in tr.find_all(['td', 'th']):
                row.append(td.get_text().strip())
            if row:
                rows.append(row)

        if rows:
            # Column width = widest cell in that column across all rows
            # (rows may have differing lengths; missing cells count as 0).
            col_widths = []
            for i in range(max(len(r) for r in rows)):
                col_width = max((len(r[i]) if i < len(r) else 0) for r in rows)
                col_widths.append(col_width)

            for row in rows:
                line = ' ' * indent
                for i, cell in enumerate(row):
                    width = col_widths[i] if i < len(col_widths) else 0
                    line += cell.ljust(width) + ' '
                self.output_lines.append(line.rstrip() + '\n')
            self.output_lines.append('\n')

    def _process_list(self, ul: Tag, indent: int, list_type: str):
        """Render a <ul>/<ol>; ordered lists carry a running counter."""
        counter = 1 if list_type == 'ol' else None
        for child in ul.children:
            if isinstance(child, Tag) and child.name == 'li':
                # For <ul>, counter is None so ctx carries a dummy 1.
                ctx = (list_type, counter) if counter else (list_type, 1)
                self._process_list_item(child, indent, ctx)
                if counter:
                    counter += 1
            else:
                self._process_node(child, indent, (list_type, 1) if not counter else None)

    def _process_list_item(self, li: Tag, indent: int, list_context):
        """Render one <li>: bullet/number prefix, direct text, then children."""
        prefix = ''
        if list_context:
            list_type, num = list_context
            prefix = '• ' if list_type == 'ul' else f'{num}. '

        # Collect the item's own text and inline links first.
        direct_parts = []
        for child in li.children:
            if isinstance(child, NavigableString):
                text = str(child).strip()
                if text:
                    direct_parts.append(text)
            elif isinstance(child, Tag) and child.name == 'a':
                href = child.get('href', '')
                link_text = child.get_text().strip()
                if href and link_text:
                    direct_parts.append(f'{link_text} ({href})')

        if direct_parts:
            self.output_lines.append(' ' * indent + prefix + ' '.join(direct_parts) + '\n')

        # Then recurse into nested elements (skipping the links already
        # emitted above) with two extra spaces of indentation.
        for child in li.children:
            if isinstance(child, Tag) and child.name != 'a':
                self._process_node(child, indent + 2, None)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Quick smoke test on a tiny document.
    sample = "<h1>标题</h1><p>段落</p><ul><li>项目1</li><li>项目2</li></ul>"
    print(HTMLTextExtractor().extract(sample))
|
||||
144
src/parser.py
Normal file
144
src/parser.py
Normal file
@@ -0,0 +1,144 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
日志解析模块
|
||||
"""
|
||||
import re
|
||||
from typing import List, Dict, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class ShipLog:
    """One ship's workload record within a single shift."""

    # Record identity: ISO date, shift name, ship name.
    date: str
    shift: str
    ship_name: str
    # Metrics; left as None when absent from the source text.
    teu: Optional[int] = None
    efficiency: Optional[float] = None
    vehicles: Optional[int] = None

    def to_dict(self) -> Dict:
        """Serialize to a plain dict (e.g. for database insertion)."""
        return {
            'date': self.date,
            'shift': self.shift,
            'ship_name': self.ship_name,
            'teu': self.teu,
            'efficiency': self.efficiency,
            'vehicles': self.vehicles
        }


class HandoverLogParser:
    """Parser for shift-handover log plain text.

    The text consists of day blocks separated by SEPARATOR; each block
    carries a '日期:YYYY.MM.DD' header, per-shift sections ('白班:' /
    '夜班:'), and one '实船作业:' entry per ship.
    """

    # Dash rule between day blocks.
    SEPARATOR = '———————————————————————————————————————————————'

    @staticmethod
    def parse_date(date_str: str) -> str:
        """Normalize 'YYYY.MM.DD' to 'YYYY-MM-DD'.

        A string that does not split into exactly three dot-separated
        parts is returned unchanged. (str.split never raises, so the
        original dead try/except has been removed.)
        """
        parts = date_str.split('.')
        if len(parts) == 3:
            return f"{parts[0]}-{parts[1]}-{parts[2]}"
        return date_str

    def parse(self, text: str) -> List[ShipLog]:
        """Parse the full log text.

        Args:
            text: Plain-text log (output of the HTML extractor).

        Returns:
            One ShipLog per ship per shift, in document order.
        """
        logs: List[ShipLog] = []

        for block in text.split(self.SEPARATOR):
            # Skip empty blocks and blocks without a date header.
            if not block.strip() or '日期:' not in block:
                continue

            date_match = re.search(r'日期:(\d{4}\.\d{2}\.\d{2})', block)
            if not date_match:
                continue

            date = self.parse_date(date_match.group(1))
            self._parse_block(block, date, logs)

        return logs

    def _parse_block(self, block: str, date: str, logs: List[ShipLog]):
        """Parse one day block: locate each shift section and its ships."""
        for shift in ['白班', '夜班']:
            shift_pattern = f'{shift}:'
            if shift_pattern not in block:
                continue

            shift_start = block.find(shift_pattern) + len(shift_pattern)

            # A section runs until the other shift header or the notes
            # header ('注意事项:'), whichever comes first.
            next_pos = len(block)
            for next_shift in ['白班', '夜班']:
                if next_shift != shift:
                    pos = block.find(f'{next_shift}:', shift_start)
                    if pos != -1 and pos < next_pos:
                        next_pos = pos

            notes_pos = block.find('注意事项:', shift_start)
            if notes_pos != -1 and notes_pos < next_pos:
                next_pos = notes_pos

            shift_content = block[shift_start:next_pos]
            self._parse_ships(shift_content, date, shift, logs)

    def _parse_ships(self, content: str, date: str, shift: str, logs: List[ShipLog]):
        """Parse every '实船作业:' entry within one shift section."""
        parts = content.split('实船作业:')

        for part in parts:
            if not part.strip():
                continue

            # \xa0 (non-breaking space) leaks in from the HTML source.
            cleaned = part.replace('\xa0', ' ').strip()
            ship_match = re.search(r'#\s+(\S+)', cleaned)

            if not ship_match:
                continue

            ship_name = ship_match.group(1)
            vehicles_match = re.search(r'上场车辆数:(\d+)', cleaned)
            teu_eff_match = re.search(
                r'作业量/效率:(\d+)TEU[,,\s]+([\d.]+)循环/车/小时', cleaned
            )

            logs.append(ShipLog(
                date=date,
                shift=shift,
                ship_name=ship_name,
                teu=int(teu_eff_match.group(1)) if teu_eff_match else None,
                efficiency=float(teu_eff_match.group(2)) if teu_eff_match else None,
                vehicles=int(vehicles_match.group(1)) if vehicles_match else None
            ))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc check against a previously extracted text dump.
    with open('layout_output.txt', 'r', encoding='utf-8') as f:
        raw_text = f.read()

    logs = HandoverLogParser().parse(raw_text)

    print(f'解析到 {len(logs)} 条记录')
    for log in logs[:5]:
        print(f'{log.date} {log.shift} {log.ship_name}: {log.teu}TEU')
|
||||
Reference in New Issue
Block a user