feat: Python RAG Worker + NestJS 内部 API（文档解析/切片/embedding/Qdrant/候选生成）

2026-05-19 22:35:12 +08:00 · 2026-05-19 22:35:12 +08:00 · fbdae9078f
commit fbdae9078f
parent c149b96b04
13 changed files with 962 additions and 0 deletions
--- a/rag-worker/api_client.py
+++ b/rag-worker/api_client.py
@ -0,0 +1,94 @@
+import httpx
+from config import API_BASE_URL, RAG_WORKER_SECRET, WORKER_ID
+
+_auth_headers = {
+    "Authorization": f"Bearer {RAG_WORKER_SECRET}",
+    "X-Worker-Id": WORKER_ID,
+}
+
+
+async def get_next_job() -> dict | None:
+    """获取下一个 QUEUED 导入任务"""
+    async with httpx.AsyncClient(timeout=30) as client:
+        resp = await client.get(
+            f"{API_BASE_URL}/internal/rag/jobs/next",
+            headers=_auth_headers,
+        )
+        if resp.status_code == 200:
+            data = resp.json()
+            return data.get("data") or data.get("job")
+        return None
+
+
+async def claim_job(job_id: str) -> bool:
+    """认领任务"""
+    async with httpx.AsyncClient(timeout=30) as client:
+        resp = await client.post(
+            f"{API_BASE_URL}/internal/rag/jobs/{job_id}/claim",
+            headers=_auth_headers,
+        )
+        return resp.status_code == 200
+
+
+async def heartbeat(job_id: str) -> bool:
+    """发送心跳"""
+    async with httpx.AsyncClient(timeout=10) as client:
+        resp = await client.post(
+            f"{API_BASE_URL}/internal/rag/jobs/{job_id}/heartbeat",
+            headers=_auth_headers,
+        )
+        return resp.status_code == 200
+
+
+async def update_job_status(job_id: str, status: str, data: dict | None = None):
+    """更新导入任务状态"""
+    async with httpx.AsyncClient(timeout=30) as client:
+        await client.post(
+            f"{API_BASE_URL}/internal/rag/jobs/{job_id}/status",
+            headers=_auth_headers,
+            json={"status": status, **(data or {})},
+        )
+
+
+async def save_chunks(chunks: list[dict]):
+    """批量保存 KnowledgeChunk"""
+    async with httpx.AsyncClient(timeout=60) as client:
+        await client.post(
+            f"{API_BASE_URL}/internal/rag/chunks",
+            headers=_auth_headers,
+            json={"chunks": chunks},
+        )
+
+
+async def save_candidates(
+    user_id: str,
+    kb_id: str,
+    source_id: str,
+    import_id: str,
+    candidates: list[dict],
+):
+    """保存候选知识点"""
+    async with httpx.AsyncClient(timeout=60) as client:
+        await client.post(
+            f"{API_BASE_URL}/internal/rag/candidates",
+            headers=_auth_headers,
+            json={
+                "userId": user_id,
+                "knowledgeBaseId": kb_id,
+                "sourceId": source_id,
+                "importId": import_id,
+                "candidates": candidates,
+            },
+        )
+
+
+async def get_job_detail(job_id: str) -> dict | None:
+    """获取任务详情（含 source 信息）"""
+    async with httpx.AsyncClient(timeout=30) as client:
+        resp = await client.get(
+            f"{API_BASE_URL}/internal/rag/jobs/{job_id}",
+            headers=_auth_headers,
+        )
+        if resp.status_code == 200:
+            return resp.json()
+        return None
--- a/rag-worker/candidate_generator.py
+++ b/rag-worker/candidate_generator.py
@ -0,0 +1,103 @@
+"""候选知识点生成：调用 DeepSeek 分析文本，生成 ImportCandidate"""
+
+import json
+import httpx
+from config import DEEPSEEK_API_KEY, DEEPSEEK_BASE_URL, DEEPSEEK_MODEL
+
+MAX_CANDIDATES = 30
+MIN_CANDIDATES = 3
+CHARS_PER_CANDIDATE = 2000
+
+_PROMPT = """你是一个学习助手。请分析以下文档内容，提取关键知识点。
+
+对于每个知识点，请提供：
+- title: 知识点标题（简洁，不超过 30 字）
+- summary: 一句话概述（不超过 80 字）
+- content: 详细解释（基于原文，保持准确）
+- tags: 2-4 个标签
+- recallQuestions: 1-2 个主动回忆问题
+- difficulty: 难度评估（easy/medium/hard）
+- confidence: 你对这个知识点重要性的置信度（0.0-1.0）
+
+请以 JSON 数组格式返回，每个元素是一个知识点：
+```json
+[{
+  "title": "知识点标题",
+  "summary": "一句话概述",
+  "content": "详细解释...",
+  "tags": ["标签1", "标签2"],
+  "recallQuestions": ["问题1？", "问题2？"],
+  "difficulty": "medium",
+  "confidence": 0.85
+}]
+```
+
+文档内容：
+{text}
+"""
+
+
+async def generate_candidates(text: str) -> list[dict]:
+    """用 DeepSeek 生成候选知识点"""
+    # 估算生成数量
+    text_len = len(text)
+    expected_count = max(MIN_CANDIDATES, min(MAX_CANDIDATES, text_len // CHARS_PER_CANDIDATE))
+
+    prompt = _PROMPT.format(text=text[:16000])  # 限制上下文长度
+
+    async with httpx.AsyncClient(timeout=120) as client:
+        resp = await client.post(
+            f"{DEEPSEEK_BASE_URL}/chat/completions",
+            headers={"Authorization": f"Bearer {DEEPSEEK_API_KEY}"},
+            json={
+                "model": DEEPSEEK_MODEL,
+                "messages": [
+                    {"role": "system", "content": "你是一个专业的学习内容分析师。请始终返回有效的 JSON 数组。"},
+                    {"role": "user", "content": prompt},
+                ],
+                "temperature": 0.3,
+                "max_tokens": 4096,
+            },
+        )
+        if resp.status_code != 200:
+            raise RuntimeError(f"DeepSeek API error: {resp.status_code} {resp.text}")
+
+        data = resp.json()
+        raw = data["choices"][0]["message"]["content"]
+
+    # 提取 JSON
+    return _parse_json_response(raw, expected_count)
+
+
+def _parse_json_response(raw: str, expected_count: int) -> list[dict]:
+    """从 AI 回复中提取 JSON 数组"""
+    # 尝试直接解析
+    try:
+        candidates = json.loads(raw)
+        if isinstance(candidates, list):
+            return candidates[:MAX_CANDIDATES]
+    except json.JSONDecodeError:
+        pass
+
+    # 提取 ```json ... ``` 块
+    import re
+    m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
+    if m:
+        try:
+            candidates = json.loads(m.group(1))
+            if isinstance(candidates, list):
+                return candidates[:MAX_CANDIDATES]
+        except json.JSONDecodeError:
+            pass
+
+    # 提取 [ ... ] 块
+    m = re.search(r"\[.*\]", raw, re.DOTALL)
+    if m:
+        try:
+            candidates = json.loads(m.group(0))
+            if isinstance(candidates, list):
+                return candidates[:MAX_CANDIDATES]
+        except json.JSONDecodeError:
+            pass
+
+    raise ValueError(f"无法解析 AI 候选知识点回复: {raw[:500]}")
--- a/rag-worker/chunker.py
+++ b/rag-worker/chunker.py
@ -0,0 +1,120 @@
+"""文本切片：递归字符分割 + 中文分句保护"""
+
+import re
+from config import CHUNK_SIZE, CHUNK_OVERLAP
+
+
+# 中文分句模式
+_CN_SENT_PATTERN = re.compile(
+    r"([。！？；\n]|(?<!\d)\.(?!\d)|!\?|\?!)"
+)
+# Markdown 标题
+_MD_HEADING = re.compile(r"^#{1,6}\s+", re.MULTILINE)
+
+
+def _split_sentences(text: str) -> list[str]:
+    """按中文标点分句，保留标点在句尾"""
+    parts = _CN_SENT_PATTERN.split(text)
+    sentences = []
+    buf = ""
+    for p in parts:
+        if not p:
+            continue
+        buf += p
+        if _CN_SENT_PATTERN.match(p):
+            sentences.append(buf)
+            buf = ""
+    if buf.strip():
+        sentences.append(buf)
+    return sentences
+
+
+def _split_by_heading(md_text: str) -> list[dict]:
+    """按 Markdown 标题分层切片，保留标题作为 sectionTitle"""
+    lines = md_text.split("\n")
+    chunks = []
+    current_title = ""
+    current_text = ""
+
+    for line in lines:
+        m = _MD_HEADING.match(line)
+        if m:
+            # 保存前一段
+            if current_text.strip():
+                chunks.append({"sectionTitle": current_title, "text": current_text.strip()})
+            current_title = line.strip()
+            current_text = ""
+        else:
+            current_text += line + "\n"
+
+    if current_text.strip():
+        chunks.append({"sectionTitle": current_title, "text": current_text.strip()})
+
+    return chunks if chunks else [{"sectionTitle": "", "text": md_text}]
+
+
+def _estimate_tokens(text: str) -> int:
+    """粗略估算 token 数量（中文按字符数，英文按词数）"""
+    cn_chars = len(re.findall(r"[一-鿿]", text))
+    en_words = len(re.findall(r"[a-zA-Z]+", text))
+    # 中文约 1.5 字符/token，英文约 1 词/token
+    return int(cn_chars / 1.5) + en_words
+
+
+def _chunk_text(text: str, section_title: str = "", page_number: int | None = None) -> list[dict]:
+    """递归分割 + 重叠切块"""
+    sentences = _split_sentences(text)
+    chunks = []
+    buf = ""
+    buf_tokens = 0
+
+    for s in sentences:
+        s_tokens = _estimate_tokens(s)
+        if buf_tokens + s_tokens > CHUNK_SIZE and buf_tokens > 0:
+            chunks.append({"content": buf.strip(), "sectionTitle": section_title, "pageNumber": page_number})
+            # 重叠：保留最后 overlap tokens
+            if CHUNK_OVERLAP > 0:
+                overlap_text = buf[-int(CHUNK_OVERLAP * 2):]  # 粗略估算
+                buf = overlap_text + s
+                buf_tokens = _estimate_tokens(overlap_text) + s_tokens
+            else:
+                buf = s
+                buf_tokens = s_tokens
+        else:
+            buf += s
+            buf_tokens += s_tokens
+
+    if buf.strip():
+        chunks.append({"content": buf.strip(), "sectionTitle": section_title, "pageNumber": page_number})
+
+    return chunks
+
+
+def chunk_document(text: str, source_type: str = "text") -> list[dict]:
+    """
+    对文档进行切片，返回 chunk 列表。
+    每个 chunk: {content, sectionTitle, pageNumber, chunkType}
+    """
+    if source_type in ("md", "markdown"):
+        sections = _split_by_heading(text)
+    else:
+        sections = [{"sectionTitle": "", "text": text}]
+
+    all_chunks = []
+    for sec in sections:
+        sec_chunks = _chunk_text(sec["text"], section_title=sec.get("sectionTitle", ""))
+        all_chunks.extend(sec_chunks)
+
+    # 添加 chunkType
+    for i, c in enumerate(all_chunks):
+        c["chunkIndex"] = i
+        # 检测表格/代码块
+        content = c["content"]
+        if content.count("|") > 5 and "---" in content:
+            c["chunkType"] = "table"
+        elif content.strip().startswith("```") or "```" in content:
+            c["chunkType"] = "code"
+        else:
+            c["chunkType"] = "text"
+
+    return all_chunks
--- a/rag-worker/config.py
+++ b/rag-worker/config.py
@ -0,0 +1,29 @@
+import os
+
+# NestJS 内部 API
+API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:3000")
+RAG_WORKER_SECRET = os.getenv("RAG_WORKER_SECRET", "")
+
+# SiliconFlow
+SILICONFLOW_API_KEY = os.getenv("SILICONFLOW_API_KEY", "")
+SILICONFLOW_BASE_URL = os.getenv("SILICONFLOW_BASE_URL", "https://api.siliconflow.cn/v1")
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "BAAI/bge-m3")
+EMBEDDING_DIM = int(os.getenv("EMBEDDING_DIM", "1024"))
+
+# DeepSeek
+DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
+DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com/v1")
+DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "deepseek-chat")
+
+# Qdrant
+QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
+QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "zhixi_chunks")
+
+# Chunking
+CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "512"))
+CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "64"))
+
+# Worker
+WORKER_ID = os.getenv("WORKER_ID", f"worker-{os.getpid()}")
+POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "5"))
+HEARTBEAT_INTERVAL = int(os.getenv("HEARTBEAT_INTERVAL", "30"))
--- a/rag-worker/embedder.py
+++ b/rag-worker/embedder.py
@ -0,0 +1,63 @@
+"""Embedding 服务：调用硅基流动 bge-m3"""
+
+import asyncio
+import httpx
+from config import (
+    SILICONFLOW_API_KEY,
+    SILICONFLOW_BASE_URL,
+    EMBEDDING_MODEL,
+    EMBEDDING_DIM,
+)
+
+BATCH_SIZE = 50
+MAX_RETRIES = 2
+
+
+async def embed_single(text: str) -> list[float]:
+    """单条文本 embedding"""
+    async with httpx.AsyncClient(timeout=30) as client:
+        resp = await client.post(
+            f"{SILICONFLOW_BASE_URL}/embeddings",
+            headers={"Authorization": f"Bearer {SILICONFLOW_API_KEY}"},
+            json={
+                "model": EMBEDDING_MODEL,
+                "input": [text],
+            },
+        )
+        if resp.status_code != 200:
+            raise RuntimeError(f"Embedding API error: {resp.status_code} {resp.text}")
+        data = resp.json()
+        return data["data"][0]["embedding"]
+
+
+async def embed_batch(texts: list[str]) -> list[list[float]]:
+    """批量 embedding，自动分批 + 重试"""
+    all_embeddings = []
+
+    for i in range(0, len(texts), BATCH_SIZE):
+        batch = texts[i:i + BATCH_SIZE]
+        for attempt in range(MAX_RETRIES + 1):
+            try:
+                async with httpx.AsyncClient(timeout=60) as client:
+                    resp = await client.post(
+                        f"{SILICONFLOW_BASE_URL}/embeddings",
+                        headers={"Authorization": f"Bearer {SILICONFLOW_API_KEY}"},
+                        json={
+                            "model": EMBEDDING_MODEL,
+                            "input": batch,
+                        },
+                    )
+                    if resp.status_code == 200:
+                        data = resp.json()
+                        all_embeddings.extend([d["embedding"] for d in data["data"]])
+                        break
+                    else:
+                        err = f"Status {resp.status_code}"
+                        if attempt == MAX_RETRIES:
+                            raise RuntimeError(f"Embedding batch failed after {MAX_RETRIES} retries: {err}")
+            except Exception as e:
+                if attempt == MAX_RETRIES:
+                    raise RuntimeError(f"Embedding batch failed: {e}")
+                await asyncio.sleep(2 ** attempt)
+
+    return all_embeddings
--- a/rag-worker/indexer.py
+++ b/rag-worker/indexer.py
@ -0,0 +1,60 @@
+"""Qdrant 索引服务"""
+
+import httpx
+from config import QDRANT_URL, QDRANT_COLLECTION
+
+
+async def upsert_points(points: list[dict]):
+    """批量写入 Qdrant points"""
+    async with httpx.AsyncClient(timeout=60) as client:
+        resp = await client.put(
+            f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points",
+            params={"wait": "true"},
+            json={"points": points},
+        )
+        if resp.status_code != 200:
+            raise RuntimeError(f"Qdrant upsert failed: {resp.text}")
+
+
+async def search(
+    vector: list[float],
+    user_id: str,
+    knowledge_base_id: str,
+    top_k: int = 5,
+) -> list[dict]:
+    """语义检索"""
+    async with httpx.AsyncClient(timeout=30) as client:
+        resp = await client.post(
+            f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
+            json={
+                "vector": vector,
+                "filter": {
+                    "must": [
+                        {"key": "userId", "match": {"value": user_id}},
+                        {"key": "knowledgeBaseId", "match": {"value": knowledge_base_id}},
+                        {"key": "deleted", "match": {"value": False}},
+                    ],
+                },
+                "limit": top_k,
+                "with_payload": True,
+            },
+        )
+        if resp.status_code != 200:
+            raise RuntimeError(f"Qdrant search failed: {resp.text}")
+        return resp.json()["result"]
+
+
+async def mark_deleted(source_id: str):
+    """将指定 source 的所有 points 标记为 deleted=true"""
+    async with httpx.AsyncClient(timeout=30) as client:
+        await client.post(
+            f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/update",
+            json={
+                "filter": {
+                    "must": [
+                        {"key": "sourceId", "match": {"value": source_id}},
+                    ]
+                },
+                "set": {"deleted": True},
+            },
+        )
--- a/rag-worker/main.py
+++ b/rag-worker/main.py
@ -0,0 +1,84 @@
+"""知习 RAG Worker — 文档导入主进程"""
+
+import asyncio
+import signal
+import sys
+from config import WORKER_ID, POLL_INTERVAL, HEARTBEAT_INTERVAL
+from api_client import get_next_job, claim_job, heartbeat, update_job_status
+from pipelines.import_pipeline import run_import
+
+running = True
+
+
+def shutdown(sig, frame):
+    global running
+    print(f"[{WORKER_ID}] 收到信号 {sig}，正在退出...")
+    running = False
+
+
+signal.signal(signal.SIGINT, shutdown)
+signal.signal(signal.SIGTERM, shutdown)
+
+
+async def heartbeat_loop():
+    """心跳循环（所有活跃任务）"""
+    # 简化实现：worker 级心跳，后续可扩展到 per-job 心跳
+    while running:
+        await asyncio.sleep(HEARTBEAT_INTERVAL)
+
+
+async def work_loop():
+    """主工作循环：轮询 → 认领 → 执行"""
+    print(f"[{WORKER_ID}] RAG Worker 已启动")
+
+    while running:
+        try:
+            job = await get_next_job()
+            if not job:
+                await asyncio.sleep(POLL_INTERVAL)
+                continue
+
+            job_id = job.get("id") or job.get("jobId")
+            if not job_id:
+                continue
+
+            # 认领任务
+            claimed = await claim_job(job_id)
+            if not claimed:
+                continue
+
+            print(f"[{WORKER_ID}] 开始处理任务 {job_id}")
+
+            # 启动心跳（后台任务）
+            hb_task = asyncio.create_task(_per_job_heartbeat(job_id))
+
+            try:
+                await run_import(job)
+                print(f"[{WORKER_ID}] 任务 {job_id} 完成")
+            except Exception as e:
+                print(f"[{WORKER_ID}] 任务 {job_id} 失败: {e}")
+                await update_job_status(job_id, "FAILED_RETRYABLE", {
+                    "errorMessage": str(e)[:500],
+                })
+            finally:
+                hb_task.cancel()
+
+        except Exception as e:
+            print(f"[{WORKER_ID}] 轮询异常: {e}")
+            await asyncio.sleep(POLL_INTERVAL)
+
+    print(f"[{WORKER_ID}] Worker 已停止")
+
+
+async def _per_job_heartbeat(job_id: str):
+    """单个任务的心跳上报"""
+    while running:
+        try:
+            await heartbeat(job_id)
+        except Exception:
+            pass
+        await asyncio.sleep(HEARTBEAT_INTERVAL)
+
+
+if __name__ == "__main__":
+    asyncio.run(work_loop())
--- a/rag-worker/parser.py
+++ b/rag-worker/parser.py
@ -0,0 +1,137 @@
+"""文档解析：PDF / DOCX / TXT / MD / CSV / XLSX"""
+
+import os
+import io
+import base64
+import httpx
+from config import SILICONFLOW_API_KEY, SILICONFLOW_BASE_URL
+
+
+async def download_file(url: str, local_path: str) -> str:
+    """从 COS 预签名 URL 下载文件到本地"""
+    async with httpx.AsyncClient(timeout=120, follow_redirects=True) as client:
+        resp = await client.get(url)
+        resp.raise_for_status()
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        with open(local_path, "wb") as f:
+            f.write(resp.content)
+    return local_path
+
+
+def parse_txt(file_path: str) -> str:
+    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
+        return f.read()
+
+
+def parse_markdown(file_path: str) -> str:
+    return parse_txt(file_path)
+
+
+def parse_docx(file_path: str) -> str:
+    from docx import Document
+    doc = Document(file_path)
+    return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+
+def parse_pdf_text(file_path: str) -> str:
+    """用 PyMuPDF 提取 PDF 文本层"""
+    import fitz
+    doc = fitz.open(file_path)
+    pages = []
+    for page in doc:
+        text = page.get_text()
+        if text.strip():
+            pages.append(text)
+    doc.close()
+    return "\n\n".join(pages)
+
+
+def pdf_needs_ocr(file_path: str) -> bool:
+    """判断 PDF 是否需要 OCR（文本层为空或极少文字）"""
+    import fitz
+    doc = fitz.open(file_path)
+    total_len = sum(len(page.get_text().strip()) for page in doc)
+    doc.close()
+    # 平均每页少于 50 字符 → 扫描件
+    page_count = max(doc.page_count if hasattr(doc, 'page_count') else len(doc), 1)
+    return (total_len / page_count) < 50
+
+
+async def ocr_with_siliconflow(image_bytes: bytes) -> str:
+    """用硅基流动多模态模型做 OCR / 图文识别"""
+    b64 = base64.b64encode(image_bytes).decode()
+    async with httpx.AsyncClient(timeout=60) as client:
+        resp = await client.post(
+            f"{SILICONFLOW_BASE_URL}/chat/completions",
+            headers={"Authorization": f"Bearer {SILICONFLOW_API_KEY}"},
+            json={
+                "model": "Qwen/Qwen3-VL-32B-Instruct",
+                "messages": [{
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "请识别并提取这张图片中的所有文字内容。如果有表格，请用 Markdown 表格格式输出。不要添加任何解释。"},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
+                    ],
+                }],
+                "max_tokens": 4096,
+            },
+        )
+        data = resp.json()
+        return data["choices"][0]["message"]["content"]
+
+
+async def parse_image_with_ocr(file_path: str) -> str:
+    """对图片进行 OCR"""
+    with open(file_path, "rb") as f:
+        image_bytes = f.read()
+    return await ocr_with_siliconflow(image_bytes)
+
+
+def parse_csv(file_path: str) -> str:
+    import pandas as pd
+    df = pd.read_csv(file_path)
+    return df.to_markdown(index=False)
+
+
+def parse_xlsx(file_path: str) -> str:
+    import pandas as pd
+    df = pd.read_excel(file_path)
+    return df.to_markdown(index=False)
+
+
+async def parse_document(file_path: str, mime_type: str) -> str:
+    """根据文件类型路由到合适的解析器"""
+    ext = os.path.splitext(file_path)[1].lower()
+
+    if ext in (".txt",):
+        return parse_txt(file_path)
+    elif ext in (".md", ".markdown"):
+        return parse_markdown(file_path)
+    elif ext in (".docx",):
+        return parse_docx(file_path)
+    elif ext in (".csv",):
+        return parse_csv(file_path)
+    elif ext in (".xlsx",):
+        return parse_xlsx(file_path)
+    elif ext in (".pdf",):
+        if pdf_needs_ocr(file_path):
+            # 扫描件——先尝试文本提取，空则走多模态
+            text = parse_pdf_text(file_path)
+            if len(text.strip()) < 100:
+                # 全扫描件，逐页 OCR
+                import fitz
+                doc = fitz.open(file_path)
+                results = []
+                for i, page in enumerate(doc):
+                    pix = page.get_pixmap(dpi=150)
+                    img_bytes = pix.tobytes("png")
+                    page_text = await ocr_with_siliconflow(img_bytes)
+                    results.append(page_text)
+                doc.close()
+                return "\n\n".join(results)
+            return text
+        return parse_pdf_text(file_path)
+    elif ext in (".png", ".jpg", ".jpeg", ".webp", ".heic", ".bmp"):
+        return await parse_image_with_ocr(file_path)
+    else:
+        raise ValueError(f"不支持的文件类型: {ext}")
--- a/rag-worker/pipelines/import_pipeline.py
+++ b/rag-worker/pipelines/import_pipeline.py
@ -0,0 +1,127 @@
+"""导入主流程：下载 → 解析 → 清洗 → 切片 → embedding → Qdrant → AI 候选"""
+
+import os
+import uuid
+from parser import download_file, parse_document
+from chunker import chunk_document
+from embedder import embed_batch
+from indexer import upsert_points
+from candidate_generator import generate_candidates
+from api_client import (
+    heartbeat as send_heartbeat,
+    update_job_status,
+    save_chunks,
+    save_candidates,
+    get_job_detail,
+)
+
+
+async def run_import(job: dict):
+    """执行完整的文档导入流程"""
+    job_id = job["id"]
+    source_id = job.get("sourceId") or job.get("source_id")
+    user_id = job["userId"] or job.get("user_id")
+    kb_id = job["knowledgeBaseId"] or job.get("knowledge_base_id")
+    file_id = job.get("fileId") or job.get("file_id")
+
+    if not source_id:
+        raise ValueError(f"任务 {job_id} 缺少 sourceId")
+
+    # 获取 source 详情（从 NestJS）
+    detail = await get_job_detail(job_id)
+    source = (detail or {}).get("source", {}) if detail else {}
+    mime_type = source.get("mimeType") or source.get("mime_type") or "text/plain"
+    original_filename = source.get("originalFilename") or source.get("original_filename") or "unknown"
+
+    tmp_dir = f"/data/tmp/imports/{job_id}"
+    file_path = os.path.join(tmp_dir, original_filename)
+
+    try:
+        # 1. 下载文件
+        await update_job_status(job_id, "DOWNLOADING", {"progress": 5})
+        file_url = source.get("downloadUrl") or (detail or {}).get("downloadUrl", "")
+        if file_url:
+            await download_file(file_url, file_path)
+
+        # 2. 解析
+        await update_job_status(job_id, "PARSING", {"progress": 20})
+        text = await parse_document(file_path, mime_type)
+
+        # 如果文件不在本地（纯文本导入），直接从 source/import 中取文本
+        if not text and (job.get("rawText") or source.get("rawText")):
+            text = job.get("rawText", "") or source.get("rawText", "")
+
+        if not text or len(text.strip()) < 10:
+            raise ValueError("文档解析后内容过少，可能为空白或损坏文件")
+
+        # 3. 清洗
+        await update_job_status(job_id, "CLEANING", {"progress": 40, "textLength": len(text)})
+
+        # 4. 切片
+        await update_job_status(job_id, "CHUNKING", {"progress": 50})
+        source_type = source.get("type") or "text"
+        chunks = chunk_document(text, source_type)
+
+        # 5. Embedding
+        await update_job_status(job_id, "EMBEDDING", {"progress": 60})
+        texts = [c["content"] for c in chunks]
+        vectors = await embed_batch(texts)
+
+        # 6. Qdrant 索引
+        await update_job_status(job_id, "INDEXING", {"progress": 80})
+        points = []
+        chunk_records = []
+        for i, (chunk, vec) in enumerate(zip(chunks, vectors)):
+            chunk_id = f"chunk_{source_id}_{i}"
+            points.append({
+                "id": chunk_id,
+                "vector": vec,
+                "payload": {
+                    "userId": user_id,
+                    "knowledgeBaseId": kb_id,
+                    "sourceId": source_id,
+                    "chunkId": chunk_id,
+                    "pageNumber": chunk.get("pageNumber"),
+                    "sectionTitle": chunk.get("sectionTitle", ""),
+                    "deleted": False,
+                },
+            })
+            chunk_records.append({
+                "userId": user_id,
+                "knowledgeBaseId": kb_id,
+                "sourceId": source_id,
+                "content": chunk["content"],
+                "chunkIndex": chunk["chunkIndex"],
+                "pageNumber": chunk.get("pageNumber"),
+                "sectionTitle": chunk.get("sectionTitle", ""),
+                "tokenCount": len(chunk["content"]),
+                "externalVectorId": chunk_id,
+                "embeddingModel": "bge-m3",
+                "embeddingStatus": "COMPLETED",
+                "metadataJson": {"chunkType": chunk.get("chunkType", "text")},
+            })
+
+        await upsert_points(points)
+        await save_chunks(chunk_records)
+
+        # 7. 生成候选知识点
+        await update_job_status(job_id, "GENERATING_CANDIDATES", {"progress": 90})
+        candidates = await generate_candidates(text)
+        if candidates:
+            await save_candidates(user_id, kb_id, source_id, job_id, candidates)
+
+        # 8. 完成
+        await update_job_status(job_id, "COMPLETED", {"progress": 100})
+
+    except Exception as e:
+        await update_job_status(job_id, "FAILED_RETRYABLE", {
+            "errorCode": "WORKER_ERROR",
+            "errorMessage": str(e)[:500],
+        })
+        raise
+
+    finally:
+        # 清理临时文件
+        if os.path.exists(tmp_dir):
+            import shutil
+            shutil.rmtree(tmp_dir, ignore_errors=True)
--- a/rag-worker/requirements.txt
+++ b/rag-worker/requirements.txt
@ -0,0 +1,8 @@
+httpx>=0.27
+pydantic>=2.0
+pymupdf>=1.24
+python-docx>=1.1
+markdown>=3.5
+pandas>=2.0
+openpyxl>=3.1
+Pillow>=10.0
--- a/src/app.module.ts
+++ b/src/app.module.ts
@ -28,6 +28,7 @@ import { FilesModule } from './modules/files/files.module';
 import { WaitlistModule } from './modules/waitlist/waitlist.module';
 import { KnowledgeSourceModule } from './modules/knowledge-source/knowledge-source.module';
 import { ImportCandidateModule } from './modules/import-candidate/import-candidate.module';
+import { RagModule } from './modules/rag/rag.module';

 import { JwtAuthGuard } from './common/guards/jwt-auth.guard';
 import { RolesGuard } from './common/guards/roles.guard';
@ -85,6 +86,7 @@ import appleConfig from './config/apple.config';
    KnowledgeSourceModule,
    ImportCandidateModule,
    DocumentImportModule,
+    RagModule,
    LearningSessionModule,
    ActiveRecallModule,
    AiAnalysisModule,
--- a/src/modules/rag/internal-rag.controller.ts
+++ b/src/modules/rag/internal-rag.controller.ts
@ -0,0 +1,124 @@
+import { Controller, Get, Post, Body, Param } from '@nestjs/common';
+import { ApiTags } from '@nestjs/swagger';
+import { DocumentImportRepository } from '../document-import/document-import.repository';
+import { KnowledgeSourceRepository } from '../knowledge-source/knowledge-source.repository';
+import { ImportCandidateRepository } from '../import-candidate/import-candidate.repository';
+import { PrismaService } from '../../infrastructure/database/prisma.service';
+
+@ApiTags('internal-rag')
+@Controller('internal/rag')
+export class InternalRagController {
+  constructor(
+    private readonly importRepo: DocumentImportRepository,
+    private readonly sourceRepo: KnowledgeSourceRepository,
+    private readonly candidateRepo: ImportCandidateRepository,
+    private readonly prisma: PrismaService,
+  ) {}
+
+  @Get('jobs/next')
+  async getNextJob() {
+    const job = await this.importRepo.claimNext('');  // 先查询，不认领
+    if (!job) return { job: null };
+    return {
+      job: {
+        id: job.id,
+        userId: job.userId,
+        knowledgeBaseId: job.knowledgeBaseId,
+        sourceId: job.sourceId,
+        fileId: job.fileId,
+        sourceType: job.sourceType,
+        sourceName: job.sourceName,
+        rawText: job.rawText,
+        status: job.status,
+      },
+    };
+  }
+
+  @Get('jobs/:id')
+  async getJobDetail(@Param('id') id: string) {
+    const job = await this.importRepo.findById(id);
+    if (!job) return { job: null };
+
+    let source = null;
+    let downloadUrl = null;
+    if (job.sourceId) {
+      source = await this.sourceRepo.findById(job.sourceId);
+    }
+
+    return {
+      job: {
+        id: job.id,
+        userId: job.userId,
+        knowledgeBaseId: job.knowledgeBaseId,
+        sourceId: job.sourceId,
+        fileId: job.fileId,
+        rawText: job.rawText,
+        status: job.status,
+      },
+      source: source ? {
+        id: source.id,
+        type: source.type,
+        originalFilename: source.originalFilename,
+        mimeType: source.mimeType,
+        sizeBytes: Number(source.sizeBytes),
+        originalObjectKey: source.originalObjectKey,
+      } : null,
+    };
+  }
+
+  @Post('jobs/:id/claim')
+  async claimJob(@Param('id') id: string, @Body() body: { workerId?: string }) {
+    const workerId = body.workerId || 'unknown';
+    const result = await this.importRepo.claim(id, workerId);
+    return { success: result.count > 0 };
+  }
+
+  @Post('jobs/:id/heartbeat')
+  async heartbeat(@Param('id') id: string) {
+    await this.importRepo.heartbeat(id);
+    return { success: true };
+  }
+
+  @Post('jobs/:id/status')
+  async updateStatus(
+    @Param('id') id: string,
+    @Body() body: { status: string; progress?: number; errorCode?: string; errorMessage?: string },
+  ) {
+    await this.importRepo.updateStatus(id, body.status, {
+      step: body.status,
+      progress: body.progress,
+      errorCode: body.errorCode,
+      errorMessage: body.errorMessage,
+    });
+    return { success: true };
+  }
+
+  @Post('chunks')
+  async saveChunks(@Body() body: { chunks: any[] }) {
+    const chunks = body.chunks || [];
+    if (chunks.length > 0) {
+      await this.prisma.knowledgeChunk.createMany({ data: chunks });
+    }
+    return { success: true, count: chunks.length };
+  }
+
+  @Post('candidates')
+  async saveCandidates(
+    @Body() body: {
+      userId: string;
+      knowledgeBaseId: string;
+      sourceId: string;
+      importId: string;
+      candidates: any[];
+    },
+  ) {
+    await this.candidateRepo.createMany(
+      body.userId,
+      body.knowledgeBaseId,
+      body.sourceId,
+      body.importId,
+      body.candidates || [],
+    );
+    return { success: true, count: body.candidates?.length || 0 };
+  }
+}
--- a/src/modules/rag/rag.module.ts
+++ b/src/modules/rag/rag.module.ts
@ -0,0 +1,11 @@
+import { Module } from '@nestjs/common';
+import { InternalRagController } from './internal-rag.controller';
+import { DocumentImportModule } from '../document-import/document-import.module';
+import { KnowledgeSourceModule } from '../knowledge-source/knowledge-source.module';
+import { ImportCandidateModule } from '../import-candidate/import-candidate.module';
+
+@Module({
+  imports: [DocumentImportModule, KnowledgeSourceModule, ImportCandidateModule],
+  controllers: [InternalRagController],
+})
+export class RagModule {}