""" Blog Embeddings Generator ========================= 遍歷 Docusaurus blog 資料夾,為每篇文章產生 embedding 向量, 連同 slug 一起存成 JSON 檔。 支援增量模式:如果 JSON 已存在,只處理新增或內容有變動的文章。 用 --full 強制全部重跑。 需求: - ollama 已啟動且已 pull qwen3-embedding:8b - pip install pyyaml requests """ import os import sys import glob import json import hashlib import yaml import requests import time # ============================================================ # 設定區 # ============================================================ # 你的 Docusaurus blog 資料夾路徑 BLOG_DIR = "./blog" # 輸出檔案路徑 OUTPUT_FILE = "./blog_embeddings.json" # Ollama 設定 OLLAMA_URL = "http://localhost:11434/api/embed" OLLAMA_MODEL = "qwen3-embedding:8b" # ============================================================ def extract_meta(content: str, filepath: str) -> tuple[str | None, str | None]: """從 frontmatter 抽出 slug 和 title。""" if not content.startswith("---"): return None, None parts = content.split("---", 2) if len(parts) < 3: return None, None try: meta = yaml.safe_load(parts[1]) if isinstance(meta, dict): return meta.get("slug"), meta.get("title") except yaml.YAMLError: print(f" ⚠ YAML 解析失敗: {filepath}") return None, None def content_hash(content: str) -> str: """算出內容的 hash,用來判斷文章有沒有改過。""" return hashlib.sha256(content.encode("utf-8")).hexdigest() def get_embedding(text: str) -> list[float]: """透過 Ollama API 取得 embedding 向量。""" resp = requests.post(OLLAMA_URL, json={ "model": OLLAMA_MODEL, "input": text, }) resp.raise_for_status() return resp.json()["embeddings"][0] def load_existing(path: str) -> list[dict]: """讀取現有的 JSON,沒有就回空 list。""" if not os.path.exists(path): return [] with open(path, encoding="utf-8") as f: return json.load(f) def main(): full_mode = "--full" in sys.argv # 找出所有 .md / .mdx 檔 patterns = [ os.path.join(BLOG_DIR, "**", "*.md"), os.path.join(BLOG_DIR, "**", "*.mdx"), ] files = [] for p in patterns: files.extend(glob.glob(p, recursive=True)) files = sorted(set(files)) print(f"找到 {len(files)} 個 md/mdx 檔案") # 載入現有資料,建立 file -> entry 的索引 if full_mode: existing = {} print("模式:完整重建\n") else: existing_list = load_existing(OUTPUT_FILE) existing = {item["file"]: item for item in existing_list} print(f"模式:增量更新(現有 {len(existing)} 篇)\n") results = [] skipped = 0 reused = 0 processed = 0 for i, filepath in enumerate(files, 1): with open(filepath, encoding="utf-8") as f: content = f.read() slug, title = extract_meta(content, filepath) if not slug: print(f"[{i}/{len(files)}] 跳過(無 slug): {filepath}") skipped += 1 continue h = content_hash(content) # 如果檔案已存在且 hash 沒變,直接沿用 if filepath in existing and existing[filepath].get("hash") == h: print(f"[{i}/{len(files)}] 沒變,沿用: {slug}") results.append(existing[filepath]) reused += 1 continue print(f"[{i}/{len(files)}] 處理中: {slug}") start = time.time() embedding = get_embedding(content) elapsed = time.time() - start print(f" ✓ {len(embedding)} 維,耗時 {elapsed:.1f}s") results.append({ "slug": slug, "title": title, "file": filepath, "hash": h, "embedding": embedding, }) processed += 1 # 寫出 JSON(只讀不動原檔案) with open(OUTPUT_FILE, "w", encoding="utf-8") as f: json.dump(results, f, ensure_ascii=False, indent=2) print(f"\n完成!") print(f" 新增/更新: {processed} 篇") print(f" 沿用舊的: {reused} 篇") print(f" 跳過: {skipped} 篇(無 slug)") print(f" 輸出至: {OUTPUT_FILE}") if __name__ == "__main__": main()