Files
blog-embedding/blog_embedding.py
2026-03-26 02:15:12 +00:00

156 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Blog Embeddings Generator
=========================
遍歷 Docusaurus blog 資料夾,為每篇文章產生 embedding 向量,
連同 slug 一起存成 JSON 檔。
支援增量模式:如果 JSON 已存在,只處理新增或內容有變動的文章。
用 --full 強制全部重跑。
需求:
- ollama 已啟動且已 pull qwen3-embedding:8b
- pip install pyyaml requests
"""
import os
import sys
import glob
import json
import hashlib
import yaml
import requests
import time
# ============================================================
# Configuration
# ============================================================
# Path to your Docusaurus blog folder (relative paths resolve against the
# current working directory)
BLOG_DIR = "./blog"
# Path of the JSON file the embeddings are written to
OUTPUT_FILE = "./blog_embeddings.json"
# Ollama settings: embed endpoint URL and the embedding model to request
OLLAMA_URL = "http://localhost:11434/api/embed"
OLLAMA_MODEL = "qwen3-embedding:8b"
# ============================================================
def extract_meta(content: str, filepath: str) -> tuple[str | None, str | None]:
"""從 frontmatter 抽出 slug 和 title。"""
if not content.startswith("---"):
return None, None
parts = content.split("---", 2)
if len(parts) < 3:
return None, None
try:
meta = yaml.safe_load(parts[1])
if isinstance(meta, dict):
return meta.get("slug"), meta.get("title")
except yaml.YAMLError:
print(f" ⚠ YAML 解析失敗: {filepath}")
return None, None
def content_hash(content: str) -> str:
    """Return the SHA-256 hex digest of *content* encoded as UTF-8.

    The digest serves as a fingerprint for deciding whether a post's text
    has changed.
    """
    digest = hashlib.sha256()
    digest.update(content.encode("utf-8"))
    return digest.hexdigest()
def get_embedding(
    text: str, *, timeout: float | tuple[float, float] = (10, 600)
) -> list[float]:
    """Fetch the embedding vector for *text* from the Ollama embed API.

    Args:
        text: The text to embed.
        timeout: Passed to ``requests.post``; either one number of seconds or
            a ``(connect, read)`` pair. The default allows a long read
            because a large model can be slow, while still guaranteeing the
            call cannot block forever.

    Returns:
        The first embedding vector in the response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
        ValueError: If the response carries no embedding.
    """
    resp = requests.post(
        OLLAMA_URL,
        json={"model": OLLAMA_MODEL, "input": text},
        # Without a timeout, requests waits indefinitely on a stalled server.
        timeout=timeout,
    )
    resp.raise_for_status()

    payload = resp.json()
    embeddings = payload.get("embeddings") if isinstance(payload, dict) else None
    if not embeddings:
        # Fail with context instead of a bare KeyError/IndexError.
        raise ValueError(f"Ollama response contains no embeddings: {payload!r:.200}")
    return embeddings[0]
def load_existing(path: str) -> list[dict]:
    """Load previously saved entries from the JSON file at *path*.

    Returns an empty list when nothing exists at *path*; otherwise returns
    whatever the file's JSON content decodes to.
    """
    if os.path.exists(path):
        with open(path, encoding="utf-8") as fh:
            return json.loads(fh.read())
    return []
def _write_json_atomic(path: str, data: list[dict]) -> None:
    """Write *data* as JSON to *path* without leaving a half-written file."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    # os.replace swaps the finished file in as a single step, so an
    # interrupted run leaves the previous output intact.
    os.replace(tmp_path, path)


def main():
    """Generate embeddings for every blog post and save them as JSON.

    Scans ``BLOG_DIR`` recursively for ``.md``/``.mdx`` files. Posts without
    a ``slug`` in their front matter are skipped. Unless ``--full`` is given
    on the command line, entries already present in ``OUTPUT_FILE`` whose
    content hash is unchanged are reused instead of being re-embedded.

    A failed embedding request does not abort the run: the previous entry
    for that file (if any) is kept, the remaining posts are still processed,
    the output is written, and the process then exits with status 1 so the
    failure is not silent. The source posts themselves are never modified.
    """
    full_mode = "--full" in sys.argv

    # A wrong working directory would otherwise look like "zero posts" and
    # overwrite the existing output with an empty list.
    if not os.path.isdir(BLOG_DIR):
        sys.exit(f"找不到 blog 資料夾: {BLOG_DIR}")

    # Collect all .md / .mdx files.
    patterns = [
        os.path.join(BLOG_DIR, "**", "*.md"),
        os.path.join(BLOG_DIR, "**", "*.mdx"),
    ]
    files = []
    for p in patterns:
        files.extend(glob.glob(p, recursive=True))
    files = sorted(set(files))
    total = len(files)
    print(f"找到 {total} 個 md/mdx 檔案")

    # Load existing data and index it by file path.
    if full_mode:
        existing = {}
        print("模式:完整重建\n")
    else:
        existing = {item["file"]: item for item in load_existing(OUTPUT_FILE)}
        print(f"模式:增量更新(現有 {len(existing)} 篇)\n")

    results = []
    skipped = 0
    reused = 0
    processed = 0
    failed = 0

    for i, filepath in enumerate(files, 1):
        with open(filepath, encoding="utf-8") as f:
            content = f.read()

        slug, title = extract_meta(content, filepath)
        if not slug:
            print(f"[{i}/{total}] 跳過(無 slug): {filepath}")
            skipped += 1
            continue

        h = content_hash(content)
        previous = existing.get(filepath)

        # Unchanged since the last run: reuse the stored entry as-is.
        if previous is not None and previous.get("hash") == h:
            print(f"[{i}/{total}] 沒變,沿用: {slug}")
            results.append(previous)
            reused += 1
            continue

        print(f"[{i}/{total}] 處理中: {slug}")
        start = time.perf_counter()
        try:
            embedding = get_embedding(content)
        except (requests.RequestException, LookupError, ValueError) as exc:
            # Keep going so one bad request does not throw away every
            # embedding computed so far. The stale entry keeps its old hash,
            # so the next run will retry this file.
            print(f" ⚠ 取得 embedding 失敗: {filepath} ({exc})")
            if previous is not None:
                results.append(previous)
            failed += 1
            continue
        elapsed = time.perf_counter() - start
        print(f"{len(embedding)} 維,耗時 {elapsed:.1f}s")

        results.append({
            "slug": slug,
            "title": title,
            "file": filepath,
            "hash": h,
            "embedding": embedding,
        })
        processed += 1

    # Write the JSON output; the source posts are only read, never touched.
    _write_json_atomic(OUTPUT_FILE, results)

    print("\n完成!")
    print(f" 新增/更新: {processed}")
    print(f" 沿用舊的: {reused}")
    print(f" 跳過: {skipped} 篇(無 slug)")
    if failed:
        print(f" 失敗: {failed} 篇(下次執行會重試)")
    print(f" 輸出至: {OUTPUT_FILE}")

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()