feat: 增强 AI 入库预处理功能,支持联网补全和来源展示
This commit is contained in:
414
app.py
414
app.py
@@ -1089,7 +1089,7 @@ def _split_inbound_line_fields(line: str) -> tuple[list[str], list[str]]:
|
||||
warnings.append("未检测到逗号或Tab,已按连续空格尝试拆分")
|
||||
|
||||
if len(parts) > 5:
|
||||
parts = parts[:4] + [" ".join(parts[4:])]
|
||||
parts = parts[:4] + [" | ".join(parts[4:])]
|
||||
warnings.append("字段超过 5 列,已将多余内容合并到备注")
|
||||
|
||||
while len(parts) < 5:
|
||||
@@ -1103,6 +1103,317 @@ def _format_inbound_line(part_no: str, name: str, quantity: int, specification:
|
||||
return f"{part_no}, {name}, {safe_quantity}, {specification}, {note}".strip()
|
||||
|
||||
|
||||
def _dedupe_ordered_text(values: list[str]) -> list[str]:
|
||||
seen = set()
|
||||
output = []
|
||||
for value in values:
|
||||
text = (value or "").strip()
|
||||
if not text or text in seen:
|
||||
continue
|
||||
seen.add(text)
|
||||
output.append(text)
|
||||
return output
|
||||
|
||||
|
||||
def _split_text_fragments(value: str) -> list[str]:
    """Split *value* on newline/comma/semicolon/pipe (ASCII and fullwidth) and dedupe."""
    text = (value or "").strip()
    if not text:
        return []
    # Delimiters cover both ASCII and CJK punctuation variants.
    fragments = [piece.strip() for piece in re.split(r"[\n,,;;|]+", text)]
    return _dedupe_ordered_text([piece for piece in fragments if piece])
|
||||
|
||||
|
||||
def _normalize_inbound_name(name: str, part_no: str) -> str:
    """Clean up a part name; fall back to the part number when the name is empty."""
    candidate = _compact_spaces(name).replace(",", " ")
    candidate = re.sub(r"\s{2,}", " ", candidate)
    # An empty name degrades to the (stripped) part number so the row stays labeled.
    return candidate if candidate else (part_no or "").strip()
|
||||
|
||||
|
||||
def _normalize_inbound_specification(specification: str) -> str:
    """Canonicalize a specification into at most four fragments joined by ' / '."""
    return " / ".join(_split_text_fragments(specification)[:4])
|
||||
|
||||
|
||||
def _normalize_inbound_note(note: str) -> str:
    """Canonicalize a note into at most eight fragments joined by ' | '."""
    return " | ".join(_split_text_fragments(note)[:8])
|
||||
|
||||
|
||||
def _normalize_inbound_row_style(row: dict) -> dict:
    """Normalize the output style of one AI inbound-preprocessing row.

    Collapses specification/note fields from loose "comma fragments" into
    fixed separator formats (" / " for specs, " | " for notes) to reduce
    ambiguity when the line is re-parsed at import time, while keeping the
    searchable keywords intact.
    """
    current = dict(row or {})  # shallow copy so the caller's dict is not mutated
    part_no = (current.get("part_no") or "").strip()
    name = _normalize_inbound_name(current.get("name") or "", part_no)
    specification = _normalize_inbound_specification(current.get("specification") or "")
    note = _normalize_inbound_note(current.get("note") or "")

    warnings = list(current.get("warnings") or [])
    raw = (current.get("raw") or "").strip()

    quantity = 0
    quantity_raw = str(current.get("quantity_raw", "") or "").strip()
    try:
        quantity = _parse_non_negative_int(str(current.get("quantity", 0) or "0"), 0)
    except ValueError:
        # Unparseable quantity falls back to 0 instead of failing the row.
        quantity = 0
    if quantity_raw == "":
        # No quantity was present in the raw line; surface that as a warning.
        warnings.append("未检测到数量,默认为0")

    # Rows flagged upstream as single-field descriptions get extra note context.
    is_sparse_description = any("单字段描述" in str(msg or "") for msg in warnings)
    if part_no.upper().startswith("AUTO-"):
        # AUTO- marks a synthesized temporary part number; remind the operator
        # to confirm the real manufacturer part number.
        if "待确认厂家型号" not in note:
            note = " | ".join([p for p in [note, "待确认厂家型号"] if p])
        if is_sparse_description and raw and raw != name and raw not in note:
            # Preserve the original free-text line in the note for traceability.
            note = " | ".join([p for p in [note, raw] if p])

    current.update(
        {
            "part_no": part_no,
            "name": name,
            "quantity": int(quantity),
            "specification": specification,
            "note": note,
            "warnings": _dedupe_ordered_text(warnings),
            "errors": [],  # recomputed below from the normalized fields
        }
    )

    if not part_no:
        current["errors"].append("缺少料号")
    if not name:
        current["errors"].append("缺少名称")

    current["is_valid"] = len(current["errors"]) == 0
    current["normalized_line"] = _format_inbound_line(part_no, name, quantity, specification, note)
    return current
|
||||
|
||||
|
||||
def _guess_part_no_from_free_text(text: str) -> str:
|
||||
"""从自由文本生成一个可用的临时料号。
|
||||
|
||||
中文说明:当用户只有“描述句”而不是规范料号时,先生成 AUTO- 前缀的临时料号,
|
||||
让这一行能继续进入 AI 预处理和人工确认流程,避免直接报“缺少名称/料号”。
|
||||
"""
|
||||
raw = (text or "").strip()
|
||||
if not raw:
|
||||
return "AUTO-UNKNOWN"
|
||||
|
||||
upper = raw.upper().replace("(", "(").replace(")", ")")
|
||||
tokens = re.findall(r"[A-Z]{2,}[A-Z0-9.-]*|\d+(?:\.\d+)?(?:V|A|MA|UA|UF|NF|PF|MHZ|GHZ)?", upper)
|
||||
useful = [token for token in tokens if len(token) >= 2][:3]
|
||||
|
||||
if useful:
|
||||
base = "-".join(useful)[:26].strip("-_")
|
||||
if base:
|
||||
return f"AUTO-{base}"
|
||||
|
||||
digest = hashlib.md5(raw.encode("utf-8")).hexdigest()[:8].upper()
|
||||
return f"AUTO-{digest}"
|
||||
|
||||
|
||||
def _auto_patch_sparse_inbound_fields(part_no: str, name: str, warnings: list[str]) -> tuple[str, str, list[str]]:
    """Promote a lone description cell into a name plus a generated AUTO- part number."""
    fixed_part_no = (part_no or "").strip()
    fixed_name = (name or "").strip()
    notes = list(warnings or [])

    # Single-field description row: by default treat the only text as the name
    # and synthesize a temporary part number.
    if fixed_part_no and not fixed_name:
        has_cjk = bool(re.search(r"[\u4e00-\u9fff]", fixed_part_no))
        if has_cjk or len(fixed_part_no.split()) >= 2:
            fixed_name = fixed_part_no
            fixed_part_no = _guess_part_no_from_free_text(fixed_name)
            notes.append("检测到单字段描述,已自动生成临时料号并将描述写入名称")

    return fixed_part_no, fixed_name, _dedupe_ordered_text(notes)
|
||||
|
||||
|
||||
def _fetch_open_search_context(query: str, timeout: int) -> dict:
    """Fetch short research hints for *query* via a public search API.

    Uses the DuckDuckGo Instant Answer endpoint and returns
    ``{"query": <trimmed query>, "sources": [<up to 4 source dicts>]}``.
    Best-effort: any network or JSON failure yields an empty source list.
    """
    raw_query = (query or "").strip()
    if not raw_query:
        return {"query": "", "sources": []}

    # JSON output, no HTML markup, no disambiguation pages, Chinese locale hint.
    params = urllib.parse.urlencode(
        {
            "q": raw_query,
            "format": "json",
            "no_html": "1",
            "skip_disambig": "1",
            "kl": "cn-zh",
        }
    )
    endpoint = f"https://api.duckduckgo.com/?{params}"
    req = urllib.request.Request(
        endpoint,
        method="GET",
        headers={"User-Agent": "inventory-ai-inbound/1.0"},
    )

    def classify_source_reliability(url: str, snippet: str) -> dict:
        # Grade a result's trustworthiness by its domain, falling back to
        # keyword sniffing in the snippet text when the domain is unknown.
        parsed = urllib.parse.urlparse(url or "")
        domain = (parsed.netloc or "").lower()
        if domain.startswith("www."):
            domain = domain[4:]

        # Vendor sites, datasheet hosts and major distributors.
        # NOTE: matched by substring below, so bare "datasheet" also catches
        # datasheet-hosting domains not listed explicitly.
        high_domains = (
            "ti.com",
            "analog.com",
            "st.com",
            "nxp.com",
            "microchip.com",
            "onsemi.com",
            "infineon.com",
            "renesas.com",
            "murata.com",
            "tdk.com",
            "jlc.com",
            "szlcsc.com",
            "mouser.com",
            "digikey.com",
            "arrow.com",
            "alldatasheet",
            "datasheet",
        )
        # Encyclopedias and technical communities — usable but double-check.
        medium_domains = (
            "wikipedia.org",
            "baike.baidu.com",
            "elecfans.com",
            "eefocus.com",
            "51hei.com",
            "cnblogs.com",
            "csdn.net",
            "bilibili.com",
        )
        # Social media / e-commerce — treat only as loose hints.
        low_domains = (
            "tieba.baidu.com",
            "zhihu.com",
            "weibo.com",
            "douyin.com",
            "xiaohongshu.com",
            "taobao.com",
            "tmall.com",
            "1688.com",
            "aliexpress.com",
        )

        snippet_text = (snippet or "").lower()
        # Order matters: high, then low, then medium, then keyword fallback.
        if any(item in domain for item in high_domains):
            return {
                "reliability_level": "high",
                "reliability_label": "高可信",
                "reliability_reason": "官网/数据手册/主流分销来源",
                "domain": domain,
            }
        if any(item in domain for item in low_domains):
            return {
                "reliability_level": "low",
                "reliability_label": "低可信",
                "reliability_reason": "社区/电商/社媒内容,仅供线索参考",
                "domain": domain,
            }
        if any(item in domain for item in medium_domains):
            return {
                "reliability_level": "medium",
                "reliability_label": "中可信",
                "reliability_reason": "技术社区或百科内容,建议二次核对",
                "domain": domain,
            }
        if "datasheet" in snippet_text or "规格" in snippet_text or "参数" in snippet_text:
            # Snippet mentions spec/parameter keywords — medium confidence.
            return {
                "reliability_level": "medium",
                "reliability_label": "中可信",
                "reliability_reason": "文本包含参数关键词,建议核对原始链接",
                "domain": domain,
            }
        # Unknown source type: default to medium and ask for manual review.
        return {
            "reliability_level": "medium",
            "reliability_label": "中可信",
            "reliability_reason": "来源类型未知,建议人工确认",
            "domain": domain,
        }

    sources = []
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read().decode("utf-8", errors="ignore")
            payload = json.loads(raw)
    except Exception:
        # Best-effort lookup: swallow network/decode errors, return no hits.
        return {"query": raw_query, "sources": []}

    # Primary instant-answer abstract, if the API produced one.
    heading = str(payload.get("Heading") or "").strip()
    abstract = str(payload.get("AbstractText") or "").strip()
    abstract_url = str(payload.get("AbstractURL") or "").strip()
    if heading or abstract:
        reliability = classify_source_reliability(abstract_url, abstract)
        sources.append(
            {
                "title": heading or raw_query,
                "snippet": abstract,
                "url": abstract_url,
                **reliability,
            }
        )

    def append_related(items: list) -> None:
        # RelatedTopics nests grouped results under "Topics"; flatten
        # recursively, capping the total at 4 sources.
        for item in items or []:
            if isinstance(item, dict) and "Topics" in item:
                append_related(item.get("Topics") or [])
                continue
            if not isinstance(item, dict):
                continue
            text = str(item.get("Text") or "").strip()
            link = str(item.get("FirstURL") or "").strip()
            if not text:
                continue
            reliability = classify_source_reliability(link, text)
            sources.append(
                {
                    "title": raw_query,
                    "snippet": text,
                    "url": link,
                    **reliability,
                }
            )
            if len(sources) >= 4:
                return

    append_related(payload.get("RelatedTopics") or [])
    return {
        "query": raw_query,
        "sources": sources[:4],
    }
|
||||
|
||||
|
||||
def _build_inbound_online_context(rows: list[dict], timeout: int, max_lines: int = 4) -> list[dict]:
    """Build web-search context for inbound rows whose key fields are incomplete."""
    collected: list[dict] = []
    for entry in rows:
        if len(collected) >= max_lines:
            break
        # A row with name, specification AND note needs no online lookup.
        if entry.get("name") and entry.get("specification") and entry.get("note"):
            continue
        search_term = (entry.get("raw") or entry.get("name") or entry.get("part_no") or "").strip()
        if len(search_term) < 2:
            continue  # too short to be a meaningful query
        found = _fetch_open_search_context(search_term, timeout=timeout)
        if not found.get("sources"):
            continue
        collected.append(
            {
                "line_no": entry.get("line_no"),
                "query": found.get("query", ""),
                "sources": found.get("sources", []),
            }
        )
    return collected
|
||||
|
||||
|
||||
def _parse_inbound_preview_rows(raw_lines: list[str]) -> list[dict]:
|
||||
rows = []
|
||||
|
||||
@@ -1114,34 +1425,30 @@ def _parse_inbound_preview_rows(raw_lines: list[str]) -> list[dict]:
|
||||
specification = (parts[3] or "").strip()
|
||||
note = (parts[4] or "").strip()
|
||||
|
||||
errors = []
|
||||
if not part_no:
|
||||
errors.append("缺少料号")
|
||||
if not name:
|
||||
errors.append("缺少名称")
|
||||
part_no, name, warnings = _auto_patch_sparse_inbound_fields(part_no, name, warnings)
|
||||
|
||||
quantity = 0
|
||||
try:
|
||||
quantity = _parse_non_negative_int(quantity_raw, 0)
|
||||
except ValueError:
|
||||
errors.append("数量格式错误,必须是大于等于 0 的整数")
|
||||
warnings.append("数量格式异常,已按0处理")
|
||||
quantity = 0
|
||||
|
||||
rows.append(
|
||||
{
|
||||
"line_no": line_no,
|
||||
"raw": line,
|
||||
"part_no": part_no,
|
||||
"name": name,
|
||||
"quantity": int(quantity),
|
||||
"quantity_raw": quantity_raw,
|
||||
"specification": specification,
|
||||
"note": note,
|
||||
"errors": errors,
|
||||
"warnings": warnings,
|
||||
"is_valid": len(errors) == 0,
|
||||
"normalized_line": _format_inbound_line(part_no, name, quantity, specification, note),
|
||||
}
|
||||
)
|
||||
row = {
|
||||
"line_no": line_no,
|
||||
"raw": line,
|
||||
"part_no": part_no,
|
||||
"name": name,
|
||||
"quantity": int(quantity),
|
||||
"quantity_raw": quantity_raw,
|
||||
"specification": specification,
|
||||
"note": note,
|
||||
"errors": [],
|
||||
"warnings": warnings,
|
||||
"is_valid": True,
|
||||
"normalized_line": "",
|
||||
}
|
||||
rows.append(_normalize_inbound_row_style(row))
|
||||
|
||||
return rows
|
||||
|
||||
@@ -1188,12 +1495,6 @@ def _normalize_ai_inbound_rows(ai_rows: list, fallback_rows: list[dict]) -> list
|
||||
# AI 数量不可信时保留规则解析值,不覆盖。
|
||||
pass
|
||||
|
||||
errors = []
|
||||
if not part_no:
|
||||
errors.append("缺少料号")
|
||||
if not name:
|
||||
errors.append("缺少名称")
|
||||
|
||||
warnings = list(current.get("warnings", []))
|
||||
for w in raw_row.get("warnings", []) if isinstance(raw_row.get("warnings", []), list) else []:
|
||||
text = str(w or "").strip()
|
||||
@@ -1207,17 +1508,22 @@ def _normalize_ai_inbound_rows(ai_rows: list, fallback_rows: list[dict]) -> list
|
||||
"quantity": int(quantity),
|
||||
"specification": specification,
|
||||
"note": note,
|
||||
"errors": errors,
|
||||
"warnings": warnings,
|
||||
"is_valid": len(errors) == 0,
|
||||
"normalized_line": _format_inbound_line(part_no, name, quantity, specification, note),
|
||||
}
|
||||
)
|
||||
current = _normalize_inbound_row_style(current)
|
||||
by_line[line_no] = current
|
||||
|
||||
return [by_line[idx] for idx in sorted(by_line.keys())]
|
||||
|
||||
|
||||
def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows: list[dict], settings: dict) -> tuple[list[dict], str]:
|
||||
def _ai_enhance_inbound_preview(
|
||||
raw_lines: list[str],
|
||||
mode: str,
|
||||
fallback_rows: list[dict],
|
||||
settings: dict,
|
||||
use_web_search: bool = False,
|
||||
) -> tuple[list[dict], str, list[dict]]:
|
||||
"""使用 AI 对规则解析结果做二次修正。
|
||||
|
||||
中文说明:AI 负责“更聪明地拆分和纠错”,但最终仍会做字段约束;
|
||||
@@ -1226,9 +1532,24 @@ def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows:
|
||||
api_key = (settings.get("api_key") or "").strip()
|
||||
api_url = (settings.get("api_url") or "").strip()
|
||||
model = (settings.get("model") or "").strip()
|
||||
web_notice = ""
|
||||
|
||||
online_context = []
|
||||
if use_web_search:
|
||||
online_context = _build_inbound_online_context(
|
||||
fallback_rows,
|
||||
timeout=min(12, int(settings.get("timeout", 30))),
|
||||
max_lines=4,
|
||||
)
|
||||
if online_context:
|
||||
web_notice = f"已联网检索补充 {len(online_context)} 行参考信息"
|
||||
else:
|
||||
web_notice = "已尝试联网检索,但未找到可用补充信息"
|
||||
|
||||
if not api_key or not api_url or not model:
|
||||
return fallback_rows, "AI 参数未完整配置,已使用规则解析结果"
|
||||
if web_notice:
|
||||
return fallback_rows, f"AI 参数未完整配置,已使用规则解析结果;{web_notice}", online_context
|
||||
return fallback_rows, "AI 参数未完整配置,已使用规则解析结果", online_context
|
||||
|
||||
numbered_lines = [{"line_no": idx, "raw": line} for idx, line in enumerate(raw_lines, start=1)]
|
||||
system_prompt = (
|
||||
@@ -1237,6 +1558,7 @@ def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows:
|
||||
"请输出对象: {\"rows\":[{\"line_no\":number,\"part_no\":string,\"name\":string,\"quantity\":number,\"specification\":string,\"note\":string,\"warnings\":string[]}]}。"
|
||||
"不要新增或删除行号;每个 line_no 仅返回一条。"
|
||||
"quantity 必须是 >=0 的整数;无法确定时返回 0 并在 warnings 中说明。"
|
||||
"当原始信息不足时,可结合提供的联网检索摘要补全 name/specification/note,并保留 AUTO- 临时料号。"
|
||||
)
|
||||
user_prompt = (
|
||||
f"导入模式: {mode}\n"
|
||||
@@ -1245,6 +1567,8 @@ def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows:
|
||||
+ "\n规则解析参考(JSON):\n"
|
||||
+ json.dumps(fallback_rows, ensure_ascii=False)
|
||||
)
|
||||
if online_context:
|
||||
user_prompt += "\n联网检索补充(JSON):\n" + json.dumps(online_context, ensure_ascii=False)
|
||||
|
||||
try:
|
||||
suggestion = _call_siliconflow_chat(
|
||||
@@ -1257,9 +1581,16 @@ def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows:
|
||||
)
|
||||
parsed = json.loads(_extract_json_object_block(suggestion))
|
||||
ai_rows = parsed.get("rows", []) if isinstance(parsed, dict) else []
|
||||
return _normalize_ai_inbound_rows(ai_rows, fallback_rows), ""
|
||||
normalized_rows = _normalize_ai_inbound_rows(ai_rows, fallback_rows)
|
||||
notice_parts = []
|
||||
notice_parts.append("已自动规范规格为“ / ”分隔、备注为“ | ”分隔")
|
||||
if web_notice:
|
||||
notice_parts.append(web_notice)
|
||||
return normalized_rows, ";".join(notice_parts), online_context
|
||||
except Exception:
|
||||
return fallback_rows, "AI 解析失败,已自动回退到规则解析结果"
|
||||
if web_notice:
|
||||
return fallback_rows, f"AI 解析失败,已自动回退到规则解析结果;{web_notice}", online_context
|
||||
return fallback_rows, "AI 解析失败,已自动回退到规则解析结果", online_context
|
||||
|
||||
|
||||
def log_inventory_event(
|
||||
@@ -3548,6 +3879,7 @@ def ai_inbound_parse():
|
||||
"""
|
||||
raw_text = request.form.get("lines", "")
|
||||
mode = (request.form.get("mode", "box") or "box").strip().lower()
|
||||
use_web_search = _is_truthy_form_value(request.form.get("use_web_search", ""))
|
||||
if mode not in {"box", "bag"}:
|
||||
mode = "box"
|
||||
|
||||
@@ -3557,7 +3889,13 @@ def ai_inbound_parse():
|
||||
|
||||
fallback_rows = _parse_inbound_preview_rows(lines)
|
||||
settings = _get_ai_settings()
|
||||
rows, parse_notice = _ai_enhance_inbound_preview(lines, mode, fallback_rows, settings)
|
||||
rows, parse_notice, web_context = _ai_enhance_inbound_preview(
|
||||
lines,
|
||||
mode,
|
||||
fallback_rows,
|
||||
settings,
|
||||
use_web_search=use_web_search,
|
||||
)
|
||||
|
||||
valid_rows = [row for row in rows if row.get("is_valid")]
|
||||
invalid_rows = [row for row in rows if not row.get("is_valid")]
|
||||
@@ -3566,10 +3904,12 @@ def ai_inbound_parse():
|
||||
return {
|
||||
"ok": True,
|
||||
"mode": mode,
|
||||
"use_web_search": use_web_search,
|
||||
"total_lines": len(rows),
|
||||
"valid_count": len(valid_rows),
|
||||
"invalid_count": len(invalid_rows),
|
||||
"parse_notice": parse_notice,
|
||||
"web_context": web_context,
|
||||
"rows": rows,
|
||||
"normalized_lines": normalized_lines,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user