feat: 增强 AI 入库预处理功能,支持联网补全和来源展示
This commit is contained in:
414
app.py
414
app.py
@@ -1089,7 +1089,7 @@ def _split_inbound_line_fields(line: str) -> tuple[list[str], list[str]]:
|
||||
warnings.append("未检测到逗号或Tab,已按连续空格尝试拆分")
|
||||
|
||||
if len(parts) > 5:
|
||||
parts = parts[:4] + [" ".join(parts[4:])]
|
||||
parts = parts[:4] + [" | ".join(parts[4:])]
|
||||
warnings.append("字段超过 5 列,已将多余内容合并到备注")
|
||||
|
||||
while len(parts) < 5:
|
||||
@@ -1103,6 +1103,317 @@ def _format_inbound_line(part_no: str, name: str, quantity: int, specification:
|
||||
return f"{part_no}, {name}, {safe_quantity}, {specification}, {note}".strip()
|
||||
|
||||
|
||||
def _dedupe_ordered_text(values: list[str]) -> list[str]:
|
||||
seen = set()
|
||||
output = []
|
||||
for value in values:
|
||||
text = (value or "").strip()
|
||||
if not text or text in seen:
|
||||
continue
|
||||
seen.add(text)
|
||||
output.append(text)
|
||||
return output
|
||||
|
||||
|
||||
def _split_text_fragments(value: str) -> list[str]:
    """Split *value* on newline/comma/semicolon/pipe (ASCII and fullwidth) and dedupe."""
    text = (value or "").strip()
    if not text:
        return []
    # Delimiters cover both ASCII and CJK punctuation variants.
    fragments = [piece.strip() for piece in re.split(r"[\n,,;;|]+", text)]
    return _dedupe_ordered_text([piece for piece in fragments if piece])
|
||||
|
||||
|
||||
def _normalize_inbound_name(name: str, part_no: str) -> str:
    """Clean up a part name; fall back to the part number when the name is empty."""
    candidate = _compact_spaces(name).replace(",", " ")
    candidate = re.sub(r"\s{2,}", " ", candidate)
    # An empty name degrades to the (stripped) part number so the row stays labeled.
    return candidate if candidate else (part_no or "").strip()
|
||||
|
||||
|
||||
def _normalize_inbound_specification(specification: str) -> str:
    """Canonicalize a specification into at most four fragments joined by ' / '."""
    return " / ".join(_split_text_fragments(specification)[:4])
|
||||
|
||||
|
||||
def _normalize_inbound_note(note: str) -> str:
    """Canonicalize a note into at most eight fragments joined by ' | '."""
    return " | ".join(_split_text_fragments(note)[:8])
|
||||
|
||||
|
||||
def _normalize_inbound_row_style(row: dict) -> dict:
    """Normalize the output style of one AI inbound-preprocessing row.

    Collapses specification/note fields from loose "comma fragments" into
    fixed separator formats (" / " for specs, " | " for notes) to reduce
    ambiguity when the line is re-parsed at import time, while keeping the
    searchable keywords intact.
    """
    current = dict(row or {})  # shallow copy so the caller's dict is not mutated
    part_no = (current.get("part_no") or "").strip()
    name = _normalize_inbound_name(current.get("name") or "", part_no)
    specification = _normalize_inbound_specification(current.get("specification") or "")
    note = _normalize_inbound_note(current.get("note") or "")

    warnings = list(current.get("warnings") or [])
    raw = (current.get("raw") or "").strip()

    quantity = 0
    quantity_raw = str(current.get("quantity_raw", "") or "").strip()
    try:
        quantity = _parse_non_negative_int(str(current.get("quantity", 0) or "0"), 0)
    except ValueError:
        # Unparseable quantity falls back to 0 instead of failing the row.
        quantity = 0
    if quantity_raw == "":
        # No quantity was present in the raw line; surface that as a warning.
        warnings.append("未检测到数量,默认为0")

    # Rows flagged upstream as single-field descriptions get extra note context.
    is_sparse_description = any("单字段描述" in str(msg or "") for msg in warnings)
    if part_no.upper().startswith("AUTO-"):
        # AUTO- marks a synthesized temporary part number; remind the operator
        # to confirm the real manufacturer part number.
        if "待确认厂家型号" not in note:
            note = " | ".join([p for p in [note, "待确认厂家型号"] if p])
        if is_sparse_description and raw and raw != name and raw not in note:
            # Preserve the original free-text line in the note for traceability.
            note = " | ".join([p for p in [note, raw] if p])

    current.update(
        {
            "part_no": part_no,
            "name": name,
            "quantity": int(quantity),
            "specification": specification,
            "note": note,
            "warnings": _dedupe_ordered_text(warnings),
            "errors": [],  # recomputed below from the normalized fields
        }
    )

    if not part_no:
        current["errors"].append("缺少料号")
    if not name:
        current["errors"].append("缺少名称")

    current["is_valid"] = len(current["errors"]) == 0
    current["normalized_line"] = _format_inbound_line(part_no, name, quantity, specification, note)
    return current
|
||||
|
||||
|
||||
def _guess_part_no_from_free_text(text: str) -> str:
|
||||
"""从自由文本生成一个可用的临时料号。
|
||||
|
||||
中文说明:当用户只有“描述句”而不是规范料号时,先生成 AUTO- 前缀的临时料号,
|
||||
让这一行能继续进入 AI 预处理和人工确认流程,避免直接报“缺少名称/料号”。
|
||||
"""
|
||||
raw = (text or "").strip()
|
||||
if not raw:
|
||||
return "AUTO-UNKNOWN"
|
||||
|
||||
upper = raw.upper().replace("(", "(").replace(")", ")")
|
||||
tokens = re.findall(r"[A-Z]{2,}[A-Z0-9.-]*|\d+(?:\.\d+)?(?:V|A|MA|UA|UF|NF|PF|MHZ|GHZ)?", upper)
|
||||
useful = [token for token in tokens if len(token) >= 2][:3]
|
||||
|
||||
if useful:
|
||||
base = "-".join(useful)[:26].strip("-_")
|
||||
if base:
|
||||
return f"AUTO-{base}"
|
||||
|
||||
digest = hashlib.md5(raw.encode("utf-8")).hexdigest()[:8].upper()
|
||||
return f"AUTO-{digest}"
|
||||
|
||||
|
||||
def _auto_patch_sparse_inbound_fields(part_no: str, name: str, warnings: list[str]) -> tuple[str, str, list[str]]:
    """Promote a lone description cell into a name plus a generated AUTO- part number."""
    fixed_part_no = (part_no or "").strip()
    fixed_name = (name or "").strip()
    notes = list(warnings or [])

    # Single-field description row: by default treat the only text as the name
    # and synthesize a temporary part number.
    if fixed_part_no and not fixed_name:
        has_cjk = bool(re.search(r"[\u4e00-\u9fff]", fixed_part_no))
        if has_cjk or len(fixed_part_no.split()) >= 2:
            fixed_name = fixed_part_no
            fixed_part_no = _guess_part_no_from_free_text(fixed_name)
            notes.append("检测到单字段描述,已自动生成临时料号并将描述写入名称")

    return fixed_part_no, fixed_name, _dedupe_ordered_text(notes)
|
||||
|
||||
|
||||
def _fetch_open_search_context(query: str, timeout: int) -> dict:
    """Fetch short research hints for *query* via a public search API.

    Uses the DuckDuckGo Instant Answer endpoint and returns
    ``{"query": <trimmed query>, "sources": [<up to 4 source dicts>]}``.
    Best-effort: any network or JSON failure yields an empty source list.
    """
    raw_query = (query or "").strip()
    if not raw_query:
        return {"query": "", "sources": []}

    # JSON output, no HTML markup, no disambiguation pages, Chinese locale hint.
    params = urllib.parse.urlencode(
        {
            "q": raw_query,
            "format": "json",
            "no_html": "1",
            "skip_disambig": "1",
            "kl": "cn-zh",
        }
    )
    endpoint = f"https://api.duckduckgo.com/?{params}"
    req = urllib.request.Request(
        endpoint,
        method="GET",
        headers={"User-Agent": "inventory-ai-inbound/1.0"},
    )

    def classify_source_reliability(url: str, snippet: str) -> dict:
        # Grade a result's trustworthiness by its domain, falling back to
        # keyword sniffing in the snippet text when the domain is unknown.
        parsed = urllib.parse.urlparse(url or "")
        domain = (parsed.netloc or "").lower()
        if domain.startswith("www."):
            domain = domain[4:]

        # Vendor sites, datasheet hosts and major distributors.
        # NOTE: matched by substring below, so bare "datasheet" also catches
        # datasheet-hosting domains not listed explicitly.
        high_domains = (
            "ti.com",
            "analog.com",
            "st.com",
            "nxp.com",
            "microchip.com",
            "onsemi.com",
            "infineon.com",
            "renesas.com",
            "murata.com",
            "tdk.com",
            "jlc.com",
            "szlcsc.com",
            "mouser.com",
            "digikey.com",
            "arrow.com",
            "alldatasheet",
            "datasheet",
        )
        # Encyclopedias and technical communities — usable but double-check.
        medium_domains = (
            "wikipedia.org",
            "baike.baidu.com",
            "elecfans.com",
            "eefocus.com",
            "51hei.com",
            "cnblogs.com",
            "csdn.net",
            "bilibili.com",
        )
        # Social media / e-commerce — treat only as loose hints.
        low_domains = (
            "tieba.baidu.com",
            "zhihu.com",
            "weibo.com",
            "douyin.com",
            "xiaohongshu.com",
            "taobao.com",
            "tmall.com",
            "1688.com",
            "aliexpress.com",
        )

        snippet_text = (snippet or "").lower()
        # Order matters: high, then low, then medium, then keyword fallback.
        if any(item in domain for item in high_domains):
            return {
                "reliability_level": "high",
                "reliability_label": "高可信",
                "reliability_reason": "官网/数据手册/主流分销来源",
                "domain": domain,
            }
        if any(item in domain for item in low_domains):
            return {
                "reliability_level": "low",
                "reliability_label": "低可信",
                "reliability_reason": "社区/电商/社媒内容,仅供线索参考",
                "domain": domain,
            }
        if any(item in domain for item in medium_domains):
            return {
                "reliability_level": "medium",
                "reliability_label": "中可信",
                "reliability_reason": "技术社区或百科内容,建议二次核对",
                "domain": domain,
            }
        if "datasheet" in snippet_text or "规格" in snippet_text or "参数" in snippet_text:
            # Snippet mentions spec/parameter keywords — medium confidence.
            return {
                "reliability_level": "medium",
                "reliability_label": "中可信",
                "reliability_reason": "文本包含参数关键词,建议核对原始链接",
                "domain": domain,
            }
        # Unknown source type: default to medium and ask for manual review.
        return {
            "reliability_level": "medium",
            "reliability_label": "中可信",
            "reliability_reason": "来源类型未知,建议人工确认",
            "domain": domain,
        }

    sources = []
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read().decode("utf-8", errors="ignore")
            payload = json.loads(raw)
    except Exception:
        # Best-effort lookup: swallow network/decode errors, return no hits.
        return {"query": raw_query, "sources": []}

    # Primary instant-answer abstract, if the API produced one.
    heading = str(payload.get("Heading") or "").strip()
    abstract = str(payload.get("AbstractText") or "").strip()
    abstract_url = str(payload.get("AbstractURL") or "").strip()
    if heading or abstract:
        reliability = classify_source_reliability(abstract_url, abstract)
        sources.append(
            {
                "title": heading or raw_query,
                "snippet": abstract,
                "url": abstract_url,
                **reliability,
            }
        )

    def append_related(items: list) -> None:
        # RelatedTopics nests grouped results under "Topics"; flatten
        # recursively, capping the total at 4 sources.
        for item in items or []:
            if isinstance(item, dict) and "Topics" in item:
                append_related(item.get("Topics") or [])
                continue
            if not isinstance(item, dict):
                continue
            text = str(item.get("Text") or "").strip()
            link = str(item.get("FirstURL") or "").strip()
            if not text:
                continue
            reliability = classify_source_reliability(link, text)
            sources.append(
                {
                    "title": raw_query,
                    "snippet": text,
                    "url": link,
                    **reliability,
                }
            )
            if len(sources) >= 4:
                return

    append_related(payload.get("RelatedTopics") or [])
    return {
        "query": raw_query,
        "sources": sources[:4],
    }
|
||||
|
||||
|
||||
def _build_inbound_online_context(rows: list[dict], timeout: int, max_lines: int = 4) -> list[dict]:
    """Build web-search context for inbound rows whose key fields are incomplete."""
    collected: list[dict] = []
    for entry in rows:
        if len(collected) >= max_lines:
            break
        # A row with name, specification AND note needs no online lookup.
        if entry.get("name") and entry.get("specification") and entry.get("note"):
            continue
        search_term = (entry.get("raw") or entry.get("name") or entry.get("part_no") or "").strip()
        if len(search_term) < 2:
            continue  # too short to be a meaningful query
        found = _fetch_open_search_context(search_term, timeout=timeout)
        if not found.get("sources"):
            continue
        collected.append(
            {
                "line_no": entry.get("line_no"),
                "query": found.get("query", ""),
                "sources": found.get("sources", []),
            }
        )
    return collected
|
||||
|
||||
|
||||
def _parse_inbound_preview_rows(raw_lines: list[str]) -> list[dict]:
|
||||
rows = []
|
||||
|
||||
@@ -1114,34 +1425,30 @@ def _parse_inbound_preview_rows(raw_lines: list[str]) -> list[dict]:
|
||||
specification = (parts[3] or "").strip()
|
||||
note = (parts[4] or "").strip()
|
||||
|
||||
errors = []
|
||||
if not part_no:
|
||||
errors.append("缺少料号")
|
||||
if not name:
|
||||
errors.append("缺少名称")
|
||||
part_no, name, warnings = _auto_patch_sparse_inbound_fields(part_no, name, warnings)
|
||||
|
||||
quantity = 0
|
||||
try:
|
||||
quantity = _parse_non_negative_int(quantity_raw, 0)
|
||||
except ValueError:
|
||||
errors.append("数量格式错误,必须是大于等于 0 的整数")
|
||||
warnings.append("数量格式异常,已按0处理")
|
||||
quantity = 0
|
||||
|
||||
rows.append(
|
||||
{
|
||||
"line_no": line_no,
|
||||
"raw": line,
|
||||
"part_no": part_no,
|
||||
"name": name,
|
||||
"quantity": int(quantity),
|
||||
"quantity_raw": quantity_raw,
|
||||
"specification": specification,
|
||||
"note": note,
|
||||
"errors": errors,
|
||||
"warnings": warnings,
|
||||
"is_valid": len(errors) == 0,
|
||||
"normalized_line": _format_inbound_line(part_no, name, quantity, specification, note),
|
||||
}
|
||||
)
|
||||
row = {
|
||||
"line_no": line_no,
|
||||
"raw": line,
|
||||
"part_no": part_no,
|
||||
"name": name,
|
||||
"quantity": int(quantity),
|
||||
"quantity_raw": quantity_raw,
|
||||
"specification": specification,
|
||||
"note": note,
|
||||
"errors": [],
|
||||
"warnings": warnings,
|
||||
"is_valid": True,
|
||||
"normalized_line": "",
|
||||
}
|
||||
rows.append(_normalize_inbound_row_style(row))
|
||||
|
||||
return rows
|
||||
|
||||
@@ -1188,12 +1495,6 @@ def _normalize_ai_inbound_rows(ai_rows: list, fallback_rows: list[dict]) -> list
|
||||
# AI 数量不可信时保留规则解析值,不覆盖。
|
||||
pass
|
||||
|
||||
errors = []
|
||||
if not part_no:
|
||||
errors.append("缺少料号")
|
||||
if not name:
|
||||
errors.append("缺少名称")
|
||||
|
||||
warnings = list(current.get("warnings", []))
|
||||
for w in raw_row.get("warnings", []) if isinstance(raw_row.get("warnings", []), list) else []:
|
||||
text = str(w or "").strip()
|
||||
@@ -1207,17 +1508,22 @@ def _normalize_ai_inbound_rows(ai_rows: list, fallback_rows: list[dict]) -> list
|
||||
"quantity": int(quantity),
|
||||
"specification": specification,
|
||||
"note": note,
|
||||
"errors": errors,
|
||||
"warnings": warnings,
|
||||
"is_valid": len(errors) == 0,
|
||||
"normalized_line": _format_inbound_line(part_no, name, quantity, specification, note),
|
||||
}
|
||||
)
|
||||
current = _normalize_inbound_row_style(current)
|
||||
by_line[line_no] = current
|
||||
|
||||
return [by_line[idx] for idx in sorted(by_line.keys())]
|
||||
|
||||
|
||||
def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows: list[dict], settings: dict) -> tuple[list[dict], str]:
|
||||
def _ai_enhance_inbound_preview(
|
||||
raw_lines: list[str],
|
||||
mode: str,
|
||||
fallback_rows: list[dict],
|
||||
settings: dict,
|
||||
use_web_search: bool = False,
|
||||
) -> tuple[list[dict], str, list[dict]]:
|
||||
"""使用 AI 对规则解析结果做二次修正。
|
||||
|
||||
中文说明:AI 负责“更聪明地拆分和纠错”,但最终仍会做字段约束;
|
||||
@@ -1226,9 +1532,24 @@ def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows:
|
||||
api_key = (settings.get("api_key") or "").strip()
|
||||
api_url = (settings.get("api_url") or "").strip()
|
||||
model = (settings.get("model") or "").strip()
|
||||
web_notice = ""
|
||||
|
||||
online_context = []
|
||||
if use_web_search:
|
||||
online_context = _build_inbound_online_context(
|
||||
fallback_rows,
|
||||
timeout=min(12, int(settings.get("timeout", 30))),
|
||||
max_lines=4,
|
||||
)
|
||||
if online_context:
|
||||
web_notice = f"已联网检索补充 {len(online_context)} 行参考信息"
|
||||
else:
|
||||
web_notice = "已尝试联网检索,但未找到可用补充信息"
|
||||
|
||||
if not api_key or not api_url or not model:
|
||||
return fallback_rows, "AI 参数未完整配置,已使用规则解析结果"
|
||||
if web_notice:
|
||||
return fallback_rows, f"AI 参数未完整配置,已使用规则解析结果;{web_notice}", online_context
|
||||
return fallback_rows, "AI 参数未完整配置,已使用规则解析结果", online_context
|
||||
|
||||
numbered_lines = [{"line_no": idx, "raw": line} for idx, line in enumerate(raw_lines, start=1)]
|
||||
system_prompt = (
|
||||
@@ -1237,6 +1558,7 @@ def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows:
|
||||
"请输出对象: {\"rows\":[{\"line_no\":number,\"part_no\":string,\"name\":string,\"quantity\":number,\"specification\":string,\"note\":string,\"warnings\":string[]}]}。"
|
||||
"不要新增或删除行号;每个 line_no 仅返回一条。"
|
||||
"quantity 必须是 >=0 的整数;无法确定时返回 0 并在 warnings 中说明。"
|
||||
"当原始信息不足时,可结合提供的联网检索摘要补全 name/specification/note,并保留 AUTO- 临时料号。"
|
||||
)
|
||||
user_prompt = (
|
||||
f"导入模式: {mode}\n"
|
||||
@@ -1245,6 +1567,8 @@ def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows:
|
||||
+ "\n规则解析参考(JSON):\n"
|
||||
+ json.dumps(fallback_rows, ensure_ascii=False)
|
||||
)
|
||||
if online_context:
|
||||
user_prompt += "\n联网检索补充(JSON):\n" + json.dumps(online_context, ensure_ascii=False)
|
||||
|
||||
try:
|
||||
suggestion = _call_siliconflow_chat(
|
||||
@@ -1257,9 +1581,16 @@ def _ai_enhance_inbound_preview(raw_lines: list[str], mode: str, fallback_rows:
|
||||
)
|
||||
parsed = json.loads(_extract_json_object_block(suggestion))
|
||||
ai_rows = parsed.get("rows", []) if isinstance(parsed, dict) else []
|
||||
return _normalize_ai_inbound_rows(ai_rows, fallback_rows), ""
|
||||
normalized_rows = _normalize_ai_inbound_rows(ai_rows, fallback_rows)
|
||||
notice_parts = []
|
||||
notice_parts.append("已自动规范规格为“ / ”分隔、备注为“ | ”分隔")
|
||||
if web_notice:
|
||||
notice_parts.append(web_notice)
|
||||
return normalized_rows, ";".join(notice_parts), online_context
|
||||
except Exception:
|
||||
return fallback_rows, "AI 解析失败,已自动回退到规则解析结果"
|
||||
if web_notice:
|
||||
return fallback_rows, f"AI 解析失败,已自动回退到规则解析结果;{web_notice}", online_context
|
||||
return fallback_rows, "AI 解析失败,已自动回退到规则解析结果", online_context
|
||||
|
||||
|
||||
def log_inventory_event(
|
||||
@@ -3548,6 +3879,7 @@ def ai_inbound_parse():
|
||||
"""
|
||||
raw_text = request.form.get("lines", "")
|
||||
mode = (request.form.get("mode", "box") or "box").strip().lower()
|
||||
use_web_search = _is_truthy_form_value(request.form.get("use_web_search", ""))
|
||||
if mode not in {"box", "bag"}:
|
||||
mode = "box"
|
||||
|
||||
@@ -3557,7 +3889,13 @@ def ai_inbound_parse():
|
||||
|
||||
fallback_rows = _parse_inbound_preview_rows(lines)
|
||||
settings = _get_ai_settings()
|
||||
rows, parse_notice = _ai_enhance_inbound_preview(lines, mode, fallback_rows, settings)
|
||||
rows, parse_notice, web_context = _ai_enhance_inbound_preview(
|
||||
lines,
|
||||
mode,
|
||||
fallback_rows,
|
||||
settings,
|
||||
use_web_search=use_web_search,
|
||||
)
|
||||
|
||||
valid_rows = [row for row in rows if row.get("is_valid")]
|
||||
invalid_rows = [row for row in rows if not row.get("is_valid")]
|
||||
@@ -3566,10 +3904,12 @@ def ai_inbound_parse():
|
||||
return {
|
||||
"ok": True,
|
||||
"mode": mode,
|
||||
"use_web_search": use_web_search,
|
||||
"total_lines": len(rows),
|
||||
"valid_count": len(valid_rows),
|
||||
"invalid_count": len(invalid_rows),
|
||||
"parse_notice": parse_notice,
|
||||
"web_context": web_context,
|
||||
"rows": rows,
|
||||
"normalized_lines": normalized_lines,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user