100 lines
2.7 KiB
Python
100 lines
2.7 KiB
Python
"""
Blog Similarity Finder
======================
Reads blog_embeddings.json, lets you pick one post via fuzzy search,
then lists the 20 most similar posts and the 5 least similar posts.

Requirements:
- pip install numpy iterfzf
- blog_embeddings.py has already been run to produce blog_embeddings.json
"""
|
||
|
||
import json
|
||
import numpy as np
|
||
from iterfzf import iterfzf
|
||
|
||
# ============================================================
# Settings
# ============================================================

# Path to the embeddings file (the output of blog_embeddings.py).
EMBEDDINGS_FILE = "./blog_embeddings.json"

# ============================================================
|
||
|
||
|
||
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (an ndarray or any 1-D array-like of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))`` as a plain Python float, or
        ``0.0`` when the product of the norms is zero (e.g. a zero vector),
        so the division is never attempted.
    """
    # Coerce to float arrays so plain lists are accepted and integer inputs
    # cannot overflow inside the dot product.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    norm = np.linalg.norm(a) * np.linalg.norm(b)
    if norm == 0:
        return 0.0
    return float(np.dot(a, b) / norm)
|
||
|
||
|
||
def format_row(rank: int, sim: float, title: str, slug: str) -> str:
    """Format one result line: rank, similarity score, bar, title and slug.

    Args:
        rank: Position of the entry in the listing; right-aligned to width 3.
        sim: Similarity score; printed with four decimals and drawn as a bar
            of ``int(sim * 30)`` block characters.
        title: Post title.
        slug: Post slug, shown in parentheses after the title.

    Returns:
        The formatted line, without a trailing newline.
    """
    # int() raises on NaN/inf, so a non-finite score gets an empty bar.
    # A negative score yields a negative length, which string repetition
    # already treats as empty.
    bar_len = int(sim * 30) if np.isfinite(sim) else 0
    bar = "█" * bar_len
    return f" {rank:3d}. {sim:.4f} {bar} {title} ({slug})"
|
||
|
||
|
||
def main(top_n: int = 20, bottom_n: int = 5) -> None:
    """Pick a post interactively and print its most and least similar posts.

    Loads the records in ``EMBEDDINGS_FILE``, lets the user choose one with
    ``iterfzf``, scores every other record against it with
    ``cosine_similarity`` and prints two listings via ``format_row``.

    Each record is read as a mapping with a required ``"slug"``, a required
    ``"embedding"`` and an optional ``"title"``.

    Args:
        top_n: How many of the most similar posts to list (default 20).
        bottom_n: How many of the least similar posts to list (default 5).

    Returns:
        None. Returns early when the file holds no records or when the
        user leaves the picker without selecting anything.
    """
    # Load the embeddings.
    with open(EMBEDDINGS_FILE, encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        print("找不到任何文章資料。")
        return

    # One picker entry per record: "title | slug", so both are searchable.
    choices = [
        f"{item.get('title', '(無標題)')} | {item['slug']}" for item in data
    ]

    # Fuzzy-search selection.
    selected = iterfzf(choices, prompt="選擇文章 > ")
    if not selected:
        return

    # choices is index-aligned with data, so the position of the selected
    # string identifies the record.
    idx = choices.index(selected)
    target = data[idx]
    target_vec = np.array(target["embedding"])
    target_title = target.get("title", "(無標題)")

    print(f"\n以「{target_title}」為基準:\n")

    # Score every record except the selected one.
    similarities = [
        {
            "slug": item["slug"],
            "title": item.get("title", "(無標題)"),
            "similarity": cosine_similarity(
                target_vec, np.array(item["embedding"])
            ),
        }
        for i, item in enumerate(data)
        if i != idx
    ]

    # Highest similarity first.
    similarities.sort(key=lambda x: x["similarity"], reverse=True)

    # Most similar posts.
    top = similarities[:max(top_n, 0)]
    print(f"🔥 最相似的 {top_n} 篇:")
    for rank, s in enumerate(top, 1):
        print(format_row(rank, s["similarity"], s["title"], s["slug"]))

    # Least similar posts, least similar first. A plain [-0:] slice would
    # return the whole list, hence the explicit guard.
    bottom = similarities[-bottom_n:][::-1] if bottom_n > 0 else []
    print(f"\n🧊 最不相似的 {bottom_n} 篇:")
    for rank, s in enumerate(bottom, 1):
        print(format_row(rank, s["similarity"], s["title"], s["slug"]))
|
||
|
||
|
||
# Run the interactive finder only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|