Files
blog-embedding/blog_similar.py

100 lines
2.7 KiB
Python
Raw Normal View History

2026-03-26 02:08:22 +00:00
"""
Blog Similarity Finder
======================
讀取 blog_embeddings.json用模糊搜尋選一篇文章
列出最相似的 10 篇和最不相似的 10
需求
- pip install numpy iterfzf
- 已經跑過 blog_embeddings.py 產生 blog_embeddings.json
"""
import json
import numpy as np
from iterfzf import iterfzf
# ============================================================
# Settings
# ============================================================
# Path to the embeddings file (the output of blog_embeddings.py)
EMBEDDINGS_FILE = "./blog_embeddings.json"
# ============================================================
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    The result is the dot product divided by the product of the two
    Euclidean norms. When that product is zero (at least one vector has
    zero length) the similarity is reported as 0.0 rather than dividing
    by zero.
    """
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return float(numerator / denominator) if denominator != 0 else 0.0
def format_row(rank: int, sim: float, title: str, slug: str) -> str:
    """Format one ranked result line.

    The line shows the rank, the similarity to four decimal places, a bar
    whose length is proportional to the similarity, then the title and the
    slug in parentheses.

    Args:
        rank: Position of the entry in the listing.
        sim: Similarity score; a value of 1.0 fills the whole bar.
        title: Article title.
        slug: Article slug.

    Returns:
        The formatted line, without a trailing newline.
    """
    bar_width = 30
    # Clamp the bar length: negative similarities draw an empty bar and a
    # score that rounds slightly above 1.0 cannot exceed the bar width.
    filled = max(0, min(bar_width, int(sim * bar_width)))
    # The glyph must be non-empty; repeating "" would never draw anything.
    bar = "█" * filled
    return f" {rank:3d}. {sim:.4f} {bar} {title} ({slug})"
def main(top_n: int = 20, bottom_n: int = 5) -> None:
    """Pick an article interactively and print its most and least similar peers.

    Loads the article records from EMBEDDINGS_FILE, lets the user choose one
    through the iterfzf fuzzy finder, scores every other article against the
    chosen one with cosine_similarity, and prints the highest-scoring and
    lowest-scoring entries using format_row.

    Every record must provide "slug" and "embedding"; "title" is optional
    and falls back to a placeholder.

    Args:
        top_n: Number of most similar articles to list.
        bottom_n: Number of least similar articles to list.

    Raises:
        KeyError: If a record has no "slug" or "embedding" entry.
    """
    untitled = "(無標題)"

    # Load the embeddings.
    with open(EMBEDDINGS_FILE, encoding="utf-8") as f:
        data = json.load(f)
    if not data:
        print("找不到任何文章資料。")
        return

    # One "title | slug" label per article, so either field can be searched.
    choices = [f"{item.get('title', untitled)} | {item['slug']}" for item in data]

    # Fuzzy-search selection.
    selected = iterfzf(choices, prompt="選擇文章 > ")
    if not selected:
        # Nothing was chosen, so there is nothing to compare against.
        return

    # Labels are built in the same order as data, so the label's position
    # is the index of the chosen record.
    idx = choices.index(selected)
    target = data[idx]
    target_vec = np.array(target["embedding"])
    print(f"\n以「{target.get('title', untitled)}」為基準:\n")

    # Score every article except the chosen one against the target.
    similarities = [
        {
            "slug": item["slug"],
            "title": item.get("title", untitled),
            "similarity": cosine_similarity(target_vec, np.array(item["embedding"])),
        }
        for i, item in enumerate(data)
        if i != idx
    ]

    # Sort from most to least similar.
    similarities.sort(key=lambda entry: entry["similarity"], reverse=True)

    top = similarities[:max(top_n, 0)]
    # A plain [-bottom_n:] slice would return the whole list when bottom_n
    # is 0, so guard it. The slice is reversed to show the least similar first.
    bottom = similarities[-bottom_n:][::-1] if bottom_n > 0 else []

    # The headings are derived from the same counts as the slices so the
    # two cannot drift apart.
    sections = (
        (f"🔥 最相似的 {top_n} 篇:", top),
        (f"\n🧊 最不相似的 {bottom_n} 篇:", bottom),
    )
    for heading, entries in sections:
        print(heading)
        for rank, entry in enumerate(entries, 1):
            print(format_row(rank, entry["similarity"], entry["title"], entry["slug"]))


if __name__ == "__main__":
    main()