""" Blog Similarity Finder ====================== 讀取 blog_embeddings.json,用模糊搜尋選一篇文章, 列出最相似的 10 篇和最不相似的 10 篇。 需求: - pip install numpy iterfzf - 已經跑過 blog_embeddings.py 產生 blog_embeddings.json """ import json import numpy as np from iterfzf import iterfzf # ============================================================ # 設定區 # ============================================================ # embedding 檔案路徑(blog_embeddings.py 的輸出) EMBEDDINGS_FILE = "./blog_embeddings.json" # ============================================================ def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float: """計算兩個向量的 cosine similarity。""" dot = np.dot(a, b) norm = np.linalg.norm(a) * np.linalg.norm(b) if norm == 0: return 0.0 return float(dot / norm) def format_row(rank: int, sim: float, title: str, slug: str) -> str: """格式化一行結果。""" bar = "█" * int(sim * 30) return f" {rank:3d}. {sim:.4f} {bar} {title} ({slug})" def main(): # 讀取 embeddings with open(EMBEDDINGS_FILE, encoding="utf-8") as f: data = json.load(f) if not data: print("找不到任何文章資料。") return # 建立選項:title + slug,方便搜尋 choices = [] for item in data: title = item.get("title", "(無標題)") choices.append(f"{title} | {item['slug']}") # 模糊搜尋選擇 selected = iterfzf(choices, prompt="選擇文章 > ") if not selected: return idx = choices.index(selected) target = data[idx] target_vec = np.array(target["embedding"]) target_title = target.get("title", "(無標題)") print(f"\n以「{target_title}」為基準:\n") # 計算與所有其他文章的相似度 similarities = [] for i, item in enumerate(data): if i == idx: continue sim = cosine_similarity(target_vec, np.array(item["embedding"])) similarities.append({ "slug": item["slug"], "title": item.get("title", "(無標題)"), "similarity": sim, }) # 由高到低排序 similarities.sort(key=lambda x: x["similarity"], reverse=True) # 最相似 10 篇 top = similarities[:20] print("🔥 最相似的 20 篇:") for rank, s in enumerate(top, 1): print(format_row(rank, s["similarity"], s["title"], s["slug"])) # 最不相似 10 篇 bottom = similarities[-5:] bottom.reverse() print(f"\n🧊 最不相似的 5 篇:") for rank, s in enumerate(bottom, 1): print(format_row(rank, s["similarity"], s["title"], s["slug"])) if __name__ == "__main__": main()