Upload files to "/"

blog_embedding.py (new file, 155 lines)
@@ -0,0 +1,155 @@
"""
Blog Embeddings Generator
=========================
Walks a Docusaurus blog folder, generates an embedding vector for each
post, and saves the vectors together with each post's slug to a JSON file.

Supports an incremental mode: if the JSON file already exists, only new
posts and posts whose content has changed are processed.
Use --full to force a complete rebuild.

Requirements:
- ollama is running and qwen3-embedding:8b has been pulled
- pip install pyyaml requests
"""

import glob
import hashlib
import json
import os
import sys
import time

import requests
import yaml

# ============================================================
# Configuration
# ============================================================

# Path to your Docusaurus blog folder
BLOG_DIR = "/home/wiwi/Syncthing/WiwiWisdom/blog"

# Output file path
OUTPUT_FILE = "./blog_embeddings.json"

# Ollama settings
OLLAMA_URL = "http://localhost:11434/api/embed"
OLLAMA_MODEL = "qwen3-embedding:8b"

# ============================================================


def extract_meta(content: str, filepath: str) -> tuple[str | None, str | None]:
    """Extract slug and title from the frontmatter."""
    if not content.startswith("---"):
        return None, None
    parts = content.split("---", 2)
    if len(parts) < 3:
        return None, None
    try:
        meta = yaml.safe_load(parts[1])
        if isinstance(meta, dict):
            return meta.get("slug"), meta.get("title")
    except yaml.YAMLError:
        print(f" ⚠ YAML parse failed: {filepath}")
    return None, None

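# Example (illustrative frontmatter, not from the repo): for a post beginning
#
#   ---
#   slug: my-first-post
#   title: My First Post
#   ---
#
# extract_meta returns ("my-first-post", "My First Post"); files without a
# valid frontmatter block yield (None, None) and are skipped by main().
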
def content_hash(content: str) -> str:
    """Hash the content, used to detect whether a post has changed."""
    return hashlib.sha256(content.encode("utf-8")).hexdigest()


def get_embedding(text: str) -> list[float]:
    """Fetch an embedding vector via the Ollama API."""
    resp = requests.post(OLLAMA_URL, json={
        "model": OLLAMA_MODEL,
        "input": text,
    })
    resp.raise_for_status()
    # /api/embed returns {"embeddings": [[...]]}; we send a single input,
    # so take the first (and only) vector.
    return resp.json()["embeddings"][0]


def load_existing(path: str) -> list[dict]:
    """Load the existing JSON; return an empty list if it does not exist."""
    if not os.path.exists(path):
        return []
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main():
    full_mode = "--full" in sys.argv

    # Collect all .md / .mdx files
    patterns = [
        os.path.join(BLOG_DIR, "**", "*.md"),
        os.path.join(BLOG_DIR, "**", "*.mdx"),
    ]
    files = []
    for p in patterns:
        files.extend(glob.glob(p, recursive=True))
    files = sorted(set(files))

    print(f"Found {len(files)} md/mdx files")

    # Load existing data and build a file -> entry index
    if full_mode:
        existing = {}
        print("Mode: full rebuild\n")
    else:
        existing_list = load_existing(OUTPUT_FILE)
        existing = {item["file"]: item for item in existing_list}
        print(f"Mode: incremental update ({len(existing)} existing posts)\n")

    results = []
    skipped = 0
    reused = 0
    processed = 0

    for i, filepath in enumerate(files, 1):
        with open(filepath, encoding="utf-8") as f:
            content = f.read()

        slug, title = extract_meta(content, filepath)
        if not slug:
            print(f"[{i}/{len(files)}] Skipped (no slug): {filepath}")
            skipped += 1
            continue

        h = content_hash(content)

        # If the file already has an entry and the hash is unchanged, reuse it
        if filepath in existing and existing[filepath].get("hash") == h:
            print(f"[{i}/{len(files)}] Unchanged, reusing: {slug}")
            results.append(existing[filepath])
            reused += 1
            continue

        print(f"[{i}/{len(files)}] Processing: {slug}")
        start = time.time()
        embedding = get_embedding(content)
        elapsed = time.time() - start
        print(f" ✓ {len(embedding)} dims, took {elapsed:.1f}s")

        results.append({
            "slug": slug,
            "title": title,
            "file": filepath,
            "hash": h,
            "embedding": embedding,
        })
        processed += 1

    # Write out the JSON (source files are only read, never modified)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print("\nDone!")
    print(f" Added/updated: {processed} posts")
    print(f" Reused: {reused} posts")
    print(f" Skipped: {skipped} posts (no slug)")
    print(f" Written to: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
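
A quick way to sanity-check the output (a minimal sketch, assuming the script
has been run and ./blog_embeddings.json exists; the field names match what
main() writes above):

    import json

    with open("./blog_embeddings.json", encoding="utf-8") as f:
        entries = json.load(f)

    print(f"{len(entries)} posts embedded")
    if entries:
        first = entries[0]
        print(first["slug"], first.get("title"), f"{len(first['embedding'])} dims")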

blog_similar.py (new file, 99 lines)
@@ -0,0 +1,99 @@
"""
Blog Similarity Finder
======================
Reads blog_embeddings.json, picks a post via fuzzy search, then lists
the 20 most similar and the 5 least similar posts.

Requirements:
- pip install numpy iterfzf
- blog_embedding.py has already been run to produce blog_embeddings.json
"""

import json

import numpy as np
from iterfzf import iterfzf

# ============================================================
# Configuration
# ============================================================

# Path to the embeddings file (output of blog_embedding.py)
EMBEDDINGS_FILE = "./blog_embeddings.json"

# ============================================================


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of two vectors: dot(a, b) / (|a| * |b|)."""
    dot = np.dot(a, b)
    norm = np.linalg.norm(a) * np.linalg.norm(b)
    if norm == 0:
        return 0.0
    return float(dot / norm)


def format_row(rank: int, sim: float, title: str, slug: str) -> str:
    """Format one result row, with a simple bar visualizing the similarity."""
    bar = "█" * int(sim * 30)
    return f" {rank:3d}. {sim:.4f} {bar} {title} ({slug})"


def main():
    # Load the embeddings
    with open(EMBEDDINGS_FILE, encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        print("No post data found.")
        return

    # Build choices as "title | slug" so both are searchable
    choices = []
    for item in data:
        title = item.get("title", "(untitled)")
        choices.append(f"{title} | {item['slug']}")

    # Pick a post with fuzzy search
    selected = iterfzf(choices, prompt="Select a post > ")

    if not selected:
        return

    idx = choices.index(selected)
    target = data[idx]
    target_vec = np.array(target["embedding"])
    target_title = target.get("title", "(untitled)")

    print(f'\nUsing "{target_title}" as the reference:\n')

    # Compute similarity against every other post
    similarities = []
    for i, item in enumerate(data):
        if i == idx:
            continue
        sim = cosine_similarity(target_vec, np.array(item["embedding"]))
        similarities.append({
            "slug": item["slug"],
            "title": item.get("title", "(untitled)"),
            "similarity": sim,
        })

    # Sort from most to least similar
    similarities.sort(key=lambda x: x["similarity"], reverse=True)

    # The 20 most similar posts
    top = similarities[:20]
    print("🔥 20 most similar posts:")
    for rank, s in enumerate(top, 1):
        print(format_row(rank, s["similarity"], s["title"], s["slug"]))

    # The 5 least similar posts, most dissimilar first
    bottom = similarities[-5:]
    bottom.reverse()
    print("\n🧊 5 least similar posts:")
    for rank, s in enumerate(bottom, 1):
        print(format_row(rank, s["similarity"], s["title"], s["slug"]))


if __name__ == "__main__":
    main()
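
For larger blogs, the per-post Python loop can be replaced by one vectorized
numpy computation. A minimal sketch (assumes `data` and `idx` as defined in
main() above; not part of the committed code):

    import numpy as np

    # Stack all embeddings into a matrix and L2-normalize each row; a single
    # matrix-vector product then yields every cosine similarity at once.
    M = np.array([item["embedding"] for item in data], dtype=np.float32)
    M /= np.linalg.norm(M, axis=1, keepdims=True)
    sims = M @ M[idx]          # cosine similarity of every post vs. the target
    order = np.argsort(-sims)  # most similar first (order[0] is the target itself)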