Files
blog-embedding/blog_similar.py
2026-03-26 02:08:22 +00:00

100 lines
2.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Blog Similarity Finder
======================
讀取 blog_embeddings.json,用模糊搜尋選一篇文章,
列出最相似的 20 篇和最不相似的 5 篇。
需求:
- pip install numpy iterfzf
- 已經跑過 blog_embeddings.py 產生 blog_embeddings.json
"""
import json
import numpy as np
from iterfzf import iterfzf
# ============================================================
# Configuration
# ============================================================
# Path to the embeddings JSON file (the output of blog_embeddings.py)
EMBEDDINGS_FILE = "./blog_embeddings.json"
# ============================================================
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between vectors *a* and *b*.

    The result is the dot product divided by the product of the two
    Euclidean norms. When that product is zero (at least one vector has
    zero length) the similarity is defined here as 0.0 instead of dividing
    by zero.
    """
    inner = np.dot(a, b)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the division: a zero-length vector has no direction.
    return float(inner / scale) if scale != 0 else 0.0
def format_row(rank: int, sim: float, title: str, slug: str) -> str:
    """Format one result line: rank, score, a proportional bar, title and slug.

    Args:
        rank: 1-based position of the entry in the printed list.
        sim: Similarity score; shown with four decimals and used to size
            the bar (30 cells correspond to a score of 1.0).
        title: Article title.
        slug: Article slug, printed in parentheses after the title.

    Returns:
        The formatted line, without a trailing newline.
    """
    # The bar needs a visible glyph: repeating an empty string renders
    # nothing regardless of the score. Scores at or below zero get no bar.
    bar = "█" * max(0, int(sim * 30))
    return f" {rank:3d}. {sim:.4f} {bar} {title} ({slug})"
def main(top_n: int = 20, bottom_n: int = 5):
    """Pick an article interactively and print its most/least similar peers.

    Loads the records from EMBEDDINGS_FILE, lets the user choose one entry
    through the iterfzf fuzzy picker, scores every other record against it
    with cosine_similarity, and prints the highest- and lowest-scoring
    entries. Each record is read for its "slug" and "embedding" keys and an
    optional "title".

    Args:
        top_n: How many of the most similar articles to print (default 20).
        bottom_n: How many of the least similar articles to print
            (default 5). Non-positive counts print no rows.

    Returns:
        None. Returns early if the file holds no records or nothing was
        selected in the picker.
    """
    untitled = "(無標題)"

    # Load the embedding records.
    with open(EMBEDDINGS_FILE, encoding="utf-8") as f:
        data = json.load(f)
    if not data:
        print("找不到任何文章資料。")
        return

    # One picker entry per record: title plus slug, so either can be searched.
    choices = [f"{item.get('title', untitled)} | {item['slug']}" for item in data]

    # Fuzzy-search selection; a falsy result means nothing was chosen.
    selected = iterfzf(choices, prompt="選擇文章 > ")
    if not selected:
        return

    idx = choices.index(selected)
    target = data[idx]
    target_vec = np.array(target["embedding"])
    target_title = target.get("title", untitled)
    print(f"\n以「{target_title}」為基準:\n")

    # Score every record except the selected one against the target.
    similarities = [
        {
            "slug": item["slug"],
            "title": item.get("title", untitled),
            "similarity": cosine_similarity(target_vec, np.array(item["embedding"])),
        }
        for i, item in enumerate(data)
        if i != idx
    ]

    # Highest similarity first.
    similarities.sort(key=lambda entry: entry["similarity"], reverse=True)

    # Most similar entries.
    top = similarities[:max(top_n, 0)]
    print(f"🔥 最相似的 {top_n} 篇:")
    for rank, entry in enumerate(top, 1):
        print(format_row(rank, entry["similarity"], entry["title"], entry["slug"]))

    # Least similar entries, least similar first. The explicit guard matters
    # because a slice starting at -0 would return the whole list.
    bottom = similarities[-bottom_n:][::-1] if bottom_n > 0 else []
    print(f"\n🧊 最不相似的 {bottom_n} 篇:")
    for rank, entry in enumerate(bottom, 1):
        print(format_row(rank, entry["similarity"], entry["title"], entry["slug"]))
# Run the interactive similarity lookup only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()