100 lines
2.7 KiB
Python
100 lines
2.7 KiB
Python
|
|
"""
|
|||
|
|
Blog Similarity Finder
======================

Read blog_embeddings.json, pick one article with a fuzzy finder, and
list the 20 most similar and the 5 least similar articles.

Requirements:
- pip install numpy iterfzf
- blog_embeddings.json must already exist (generated by running
  blog_embeddings.py)
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import json
import math

import numpy as np
from iterfzf import iterfzf
|
|||
|
|
|
|||
|
|
# ============================================================
# Configuration
# ============================================================

# Path to the embeddings file (the output of blog_embeddings.py).
EMBEDDINGS_FILE = "./blog_embeddings.json"

# ============================================================
|
|||
|
|
|
|||
|
|
|
|||
|
|
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: First vector (an ndarray or any sequence of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a Python float,
        clipped to ``[-1.0, 1.0]``. If either vector has zero length the
        similarity is undefined and ``0.0`` is returned instead.

    Raises:
        ValueError: Propagated from ``np.dot`` when the shapes do not align.
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)

    norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm == 0:
        # A zero vector has no direction; avoid dividing by zero.
        return 0.0

    # Floating-point rounding can push the ratio marginally outside the
    # mathematical range (e.g. 1.0000000000000002), so clip it back.
    return float(np.clip(np.dot(vec_a, vec_b) / norm, -1.0, 1.0))
|
|||
|
|
|
|||
|
|
|
|||
|
|
def format_row(rank: int, sim: float, title: str, slug: str, *, width: int = 30) -> str:
    """Format one result line: rank, similarity score, bar, title and slug.

    Args:
        rank: 1-based position of the article in the listing.
        sim: Similarity score; the bar is scaled for values in ``[0, 1]``.
        title: Article title.
        slug: Article slug, shown in parentheses after the title.
        width: Number of bar characters that represents a score of 1.0.

    Returns:
        The formatted line, without a trailing newline.
    """
    # int() cannot convert NaN, and scores outside [0, 1] would give a
    # negative or over-long bar, so clamp the fraction before scaling.
    fraction = 0.0 if math.isnan(sim) else min(1.0, max(0.0, sim))
    bar = "█" * int(fraction * width)
    return f" {rank:3d}. {sim:.4f} {bar} {title} ({slug})"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _print_ranked(header: str, rows: list) -> None:
    """Print ``header`` followed by one 1-based ranked line per row."""
    print(header)
    for rank, row in enumerate(rows, 1):
        print(format_row(rank, row["similarity"], row["title"], row["slug"]))


def main(top_n: int = 20, bottom_n: int = 5) -> None:
    """Pick an article interactively and print its nearest and farthest posts.

    Loads the records from ``EMBEDDINGS_FILE``, lets the user choose one with
    ``iterfzf``, scores every other record against it with
    ``cosine_similarity`` and prints the ``top_n`` most similar followed by
    the ``bottom_n`` least similar. Both listings are slices of the same
    sorted list, so they overlap when there are fewer than
    ``top_n + bottom_n`` other articles.

    Args:
        top_n: How many of the most similar articles to list.
        bottom_n: How many of the least similar articles to list.

    Raises:
        ValueError: If ``top_n`` or ``bottom_n`` is negative.
    """
    if top_n < 0 or bottom_n < 0:
        raise ValueError("top_n and bottom_n must be non-negative")

    # Load the embeddings.
    with open(EMBEDDINGS_FILE, encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        print("找不到任何文章資料。")
        return

    # Build the picker entries as "title | slug" so both are searchable.
    choices = []
    for item in data:
        title = item.get("title", "(無標題)")
        choices.append(f"{title} | {item['slug']}")

    # Fuzzy-search selection; a falsy result means nothing was chosen.
    selected = iterfzf(choices, prompt="選擇文章 > ")
    if not selected:
        return

    # If two entries render identically, index() resolves to the first one.
    idx = choices.index(selected)
    target = data[idx]
    target_vec = np.array(target["embedding"])
    target_title = target.get("title", "(無標題)")

    print(f"\n以「{target_title}」為基準:\n")

    # Score every other article against the selected one.
    similarities = [
        {
            "slug": item["slug"],
            "title": item.get("title", "(無標題)"),
            "similarity": cosine_similarity(target_vec, np.array(item["embedding"])),
        }
        for i, item in enumerate(data)
        if i != idx
    ]

    # Highest similarity first.
    similarities.sort(key=lambda entry: entry["similarity"], reverse=True)

    # Most similar articles; the header reports how many are actually shown.
    top = similarities[:top_n]
    _print_ranked(f"🔥 最相似的 {len(top)} 篇:", top)

    # Least similar articles, least similar first. Reversing before slicing
    # also yields an empty list for bottom_n == 0, unlike a [-0:] slice.
    bottom = similarities[::-1][:bottom_n]
    _print_ranked(f"\n🧊 最不相似的 {len(bottom)} 篇:", bottom)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the interactive finder only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|