"""
Blog Embeddings Generator
=========================
Walk a Docusaurus blog directory, generate an embedding vector for each
post via Ollama, and store the vectors (together with each post's slug)
in a JSON file.

Incremental mode: if the output JSON already exists, only new posts and
posts whose content changed are re-embedded. Pass --full on the command
line to force a complete rebuild.

Requirements:
  - ollama running with qwen3-embedding:8b pulled
  - pip install pyyaml requests
"""

import os
import sys
import glob
import json
import hashlib
import time

import requests
import yaml

# ============================================================
# Configuration
# ============================================================

# Path to the Docusaurus blog directory
BLOG_DIR = "/home/wiwi/Syncthing/WiwiWisdom/blog"

# Output file path
OUTPUT_FILE = "./blog_embeddings.json"

# Ollama settings
OLLAMA_URL = "http://localhost:11434/api/embed"
OLLAMA_MODEL = "qwen3-embedding:8b"

# HTTP timeout (seconds). Embedding a long post can be slow, but without
# a timeout a hung Ollama server would block this script forever.
REQUEST_TIMEOUT = 300

# ============================================================


def extract_meta(content: str, filepath: str) -> tuple[str | None, str | None]:
    """Extract ``slug`` and ``title`` from the YAML frontmatter.

    Returns (None, None) when the file has no frontmatter, the
    frontmatter is malformed, or it does not parse to a mapping.
    *filepath* is only used for the warning message.
    """
    if not content.startswith("---"):
        return None, None
    # split("---", 2) yields ["", frontmatter, body] for a well-formed file.
    parts = content.split("---", 2)
    if len(parts) < 3:
        return None, None
    try:
        meta = yaml.safe_load(parts[1])
        if isinstance(meta, dict):
            return meta.get("slug"), meta.get("title")
    except yaml.YAMLError:
        print(f" ⚠ YAML 解析失敗: {filepath}")
    # Fall-through: unparseable or non-mapping frontmatter.
    return None, None


def content_hash(content: str) -> str:
    """Return the SHA-256 hex digest of *content*, used to detect edits."""
    return hashlib.sha256(content.encode("utf-8")).hexdigest()


def get_embedding(text: str) -> list[float]:
    """Fetch an embedding vector for *text* from the Ollama embed API.

    Raises requests.HTTPError on a non-2xx response and
    requests.exceptions.Timeout if the server does not answer in
    REQUEST_TIMEOUT seconds.
    """
    resp = requests.post(
        OLLAMA_URL,
        json={
            "model": OLLAMA_MODEL,
            "input": text,
        },
        # Fix: the original call had no timeout and could hang indefinitely.
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return resp.json()["embeddings"][0]


def load_existing(path: str) -> list[dict]:
    """Load the previously generated JSON; return [] if it doesn't exist."""
    if not os.path.exists(path):
        return []
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main():
    """Embed every blog post, reusing unchanged entries in incremental mode."""
    full_mode = "--full" in sys.argv

    # Collect every .md / .mdx file under BLOG_DIR, recursively.
    patterns = [
        os.path.join(BLOG_DIR, "**", "*.md"),
        os.path.join(BLOG_DIR, "**", "*.mdx"),
    ]
    files = []
    for p in patterns:
        files.extend(glob.glob(p, recursive=True))
    files = sorted(set(files))

    print(f"找到 {len(files)} 個 md/mdx 檔案")

    # Load previous results and index them by file path for quick reuse.
    if full_mode:
        existing = {}
        print("模式:完整重建\n")
    else:
        existing_list = load_existing(OUTPUT_FILE)
        existing = {item["file"]: item for item in existing_list}
        print(f"模式:增量更新(現有 {len(existing)} 篇)\n")

    results = []
    skipped = 0
    reused = 0
    processed = 0

    for i, filepath in enumerate(files, 1):
        with open(filepath, encoding="utf-8") as f:
            content = f.read()

        slug, title = extract_meta(content, filepath)
        if not slug:
            print(f"[{i}/{len(files)}] 跳過(無 slug): {filepath}")
            skipped += 1
            continue

        h = content_hash(content)

        # Unchanged file: reuse the stored entry instead of re-embedding.
        if filepath in existing and existing[filepath].get("hash") == h:
            print(f"[{i}/{len(files)}] 沒變,沿用: {slug}")
            results.append(existing[filepath])
            reused += 1
            continue

        print(f"[{i}/{len(files)}] 處理中: {slug}")
        start = time.time()
        embedding = get_embedding(content)
        elapsed = time.time() - start
        print(f" ✓ {len(embedding)} 維,耗時 {elapsed:.1f}s")

        results.append({
            "slug": slug,
            "title": title,
            "file": filepath,
            "hash": h,
            "embedding": embedding,
        })
        processed += 1

    # Write the JSON output (source posts are never modified).
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n完成!")
    print(f" 新增/更新: {processed} 篇")
    print(f" 沿用舊的: {reused} 篇")
    print(f" 跳過: {skipped} 篇(無 slug)")
    print(f" 輸出至: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
"""
Blog Similarity Finder
======================
Load blog_embeddings.json, pick one post via fuzzy search, then list the
TOP_N most similar and BOTTOM_N least similar posts.

Requirements:
  - pip install numpy iterfzf
  - blog_embeddings.py has been run to produce blog_embeddings.json
"""

import json

import numpy as np
from iterfzf import iterfzf

# ============================================================
# Configuration
# ============================================================

# Embeddings file (output of blog_embeddings.py)
EMBEDDINGS_FILE = "./blog_embeddings.json"

# How many posts to show at each end of the ranking.
# Fix: the original hard-coded 20 and 5 while its docstring/comments
# claimed "10 and 10"; the counts are now named constants that match
# the documentation (behavior unchanged).
TOP_N = 20
BOTTOM_N = 5

# ============================================================


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors (0.0 if either is zero)."""
    dot = np.dot(a, b)
    norm = np.linalg.norm(a) * np.linalg.norm(b)
    if norm == 0:
        # Guard against division by zero for degenerate (all-zero) vectors.
        return 0.0
    return float(dot / norm)


def format_row(rank: int, sim: float, title: str, slug: str) -> str:
    """Format one ranked result line with a bar proportional to similarity."""
    bar = "█" * int(sim * 30)
    return f" {rank:3d}. {sim:.4f} {bar} {title} ({slug})"


def main():
    """Interactively pick a post and print its similarity ranking."""
    # Load the embeddings produced by blog_embeddings.py.
    with open(EMBEDDINGS_FILE, encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        print("找不到任何文章資料。")
        return

    # Build "title | slug" choices so both fields are fuzzy-searchable.
    choices = []
    for item in data:
        title = item.get("title", "(無標題)")
        choices.append(f"{title} | {item['slug']}")

    # Fuzzy-select one post (returns None/empty on cancel).
    selected = iterfzf(choices, prompt="選擇文章 > ")

    if not selected:
        return

    idx = choices.index(selected)
    target = data[idx]
    target_vec = np.array(target["embedding"])
    target_title = target.get("title", "(無標題)")

    print(f"\n以「{target_title}」為基準:\n")

    # Similarity of the selected post against every other post.
    similarities = []
    for i, item in enumerate(data):
        if i == idx:
            continue
        sim = cosine_similarity(target_vec, np.array(item["embedding"]))
        similarities.append({
            "slug": item["slug"],
            "title": item.get("title", "(無標題)"),
            "similarity": sim,
        })

    # Sort from most to least similar.
    similarities.sort(key=lambda x: x["similarity"], reverse=True)

    # Most similar TOP_N posts.
    top = similarities[:TOP_N]
    print(f"🔥 最相似的 {TOP_N} 篇:")
    for rank, s in enumerate(top, 1):
        print(format_row(rank, s["similarity"], s["title"], s["slug"]))

    # Least similar BOTTOM_N posts, printed least-similar first.
    bottom = similarities[-BOTTOM_N:]
    bottom.reverse()
    print(f"\n🧊 最不相似的 {BOTTOM_N} 篇:")
    for rank, s in enumerate(bottom, 1):
        print(format_row(rank, s["similarity"], s["title"], s["slug"]))


if __name__ == "__main__":
    main()