# blog-embedding/blog_embedding.py
"""
Blog Embeddings Generator
=========================
遍歷 Docusaurus blog 資料夾為每篇文章產生 embedding 向量
連同 slug 一起存成 JSON
支援增量模式如果 JSON 已存在只處理新增或內容有變動的文章
--full 強制全部重跑
需求
- ollama 已啟動且已 pull qwen3-embedding:8b
- pip install pyyaml requests
"""
import os
import sys
import glob
import json
import hashlib
import yaml
import requests
import time
# ============================================================
# Configuration
# ============================================================
# Path to your Docusaurus blog directory.
BLOG_DIR = "./blog"
# Path of the generated JSON output file.
OUTPUT_FILE = "./blog_embeddings.json"
# Ollama endpoint and embedding model.
OLLAMA_URL = "http://localhost:11434/api/embed"
OLLAMA_MODEL = "qwen3-embedding:8b"
# ============================================================
def extract_meta(content: str, filepath: str) -> tuple[str | None, str | None]:
"""從 frontmatter 抽出 slug 和 title。"""
if not content.startswith("---"):
return None, None
parts = content.split("---", 2)
if len(parts) < 3:
return None, None
try:
meta = yaml.safe_load(parts[1])
if isinstance(meta, dict):
return meta.get("slug"), meta.get("title")
except yaml.YAMLError:
print(f" ⚠ YAML 解析失敗: {filepath}")
return None, None
def content_hash(content: str) -> str:
    """Return the SHA-256 hex digest of *content* (change-detection key)."""
    digest = hashlib.sha256()
    digest.update(content.encode("utf-8"))
    return digest.hexdigest()
def get_embedding(text: str, timeout: float = 120.0) -> list[float]:
    """Fetch an embedding vector for *text* from the local Ollama server.

    Args:
        text: Raw document text to embed.
        timeout: Seconds to wait for the HTTP request. Without a timeout,
            ``requests.post`` can block forever if the Ollama server stalls;
            the default keeps the original call signature backward compatible.

    Returns:
        The embedding vector for the single input.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    resp = requests.post(
        OLLAMA_URL,
        json={"model": OLLAMA_MODEL, "input": text},
        timeout=timeout,
    )
    resp.raise_for_status()
    # /api/embed returns {"embeddings": [[...]]} even for a single input.
    return resp.json()["embeddings"][0]
def load_existing(path: str) -> list[dict]:
    """Load previously generated entries from *path*; an absent file yields []."""
    try:
        with open(path, encoding="utf-8") as fh:
            return json.load(fh)
    except FileNotFoundError:
        return []
def main() -> None:
    """Scan the blog directory and (re)generate the embeddings JSON.

    Incremental by default: entries whose content hash is unchanged are
    carried over from the existing output file. Passing ``--full`` on the
    command line rebuilds everything from scratch.
    """
    full_mode = "--full" in sys.argv
    # Collect every .md / .mdx file under BLOG_DIR, recursively.
    patterns = [
        os.path.join(BLOG_DIR, "**", "*.md"),
        os.path.join(BLOG_DIR, "**", "*.mdx"),
    ]
    files = []
    for p in patterns:
        files.extend(glob.glob(p, recursive=True))
    # De-duplicate and sort for a deterministic processing order.
    files = sorted(set(files))
    print(f"找到 {len(files)} 個 md/mdx 檔案")
    # Load existing data and index it by file path for O(1) lookup.
    if full_mode:
        existing = {}
        print("模式:完整重建\n")
    else:
        existing_list = load_existing(OUTPUT_FILE)
        existing = {item["file"]: item for item in existing_list}
        print(f"模式:增量更新(現有 {len(existing)} 篇)\n")
    results = []
    skipped = 0    # articles without a slug
    reused = 0     # unchanged articles carried over from the old JSON
    processed = 0  # articles embedded this run (new or modified)
    for i, filepath in enumerate(files, 1):
        with open(filepath, encoding="utf-8") as f:
            content = f.read()
        slug, title = extract_meta(content, filepath)
        if not slug:
            print(f"[{i}/{len(files)}] 跳過(無 slug: {filepath}")
            skipped += 1
            continue
        h = content_hash(content)
        # Reuse the stored entry when the file content is unchanged.
        if filepath in existing and existing[filepath].get("hash") == h:
            print(f"[{i}/{len(files)}] 沒變,沿用: {slug}")
            results.append(existing[filepath])
            reused += 1
            continue
        print(f"[{i}/{len(files)}] 處理中: {slug}")
        start = time.time()
        embedding = get_embedding(content)
        elapsed = time.time() - start
        print(f"{len(embedding)} 維,耗時 {elapsed:.1f}s")
        results.append({
            "slug": slug,
            "title": title,
            "file": filepath,
            "hash": h,
            "embedding": embedding,
        })
        processed += 1
    # Write the combined JSON; source articles are read-only, never modified.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n完成!")
    print(f" 新增/更新: {processed}")
    print(f" 沿用舊的: {reused}")
    print(f" 跳過: {skipped} 篇(無 slug")
    print(f" 輸出至: {OUTPUT_FILE}")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()