This repository has no description
1"""Content-based dedup: collapse fork READMEs that share identical text."""
2
3from __future__ import annotations
4
5from hashlib import md5
6
7from app.types import Candidate
8
9
def content_hash(content: str | None = None, *, content_sha500: str | None = None) -> str:
    """Return the dedup key for a piece of content.

    Args:
        content: Raw text to fingerprint. ``None`` is treated as the empty
            string.
        content_sha500: Optional precomputed key. When it is a non-empty
            value it is returned unchanged and ``content`` is ignored.

    Returns:
        ``content_sha500`` if it is truthy; otherwise the hexadecimal MD5
        digest of the first 500 characters of ``content``, UTF-8 encoded.
    """
    if not content_sha500:
        text = content or ""
        # Only the leading 500 characters take part in the fingerprint.
        prefix = text[:500]
        digest = md5(prefix.encode("utf-8"))
        return digest.hexdigest()
    return content_sha500
14
15
def row_content_hash(row: dict) -> str:
    """Return the dedup key for a row given as a dict.

    Args:
        row: Mapping that is looked up under the keys ``"content_sha500"``
            and ``"content"``; either may be absent.

    Returns:
        The ``"content_sha500"`` value when it is a non-empty string;
        otherwise whatever ``content_hash`` returns for the row's
        ``"content"`` value.
    """
    precomputed = row.get("content_sha500")
    # Anything other than a non-empty string (missing, None, wrong type)
    # falls back to hashing the content itself.
    if not (isinstance(precomputed, str) and precomputed):
        return content_hash(row.get("content"))
    return precomputed
21
22
def collapse_forks(candidates: list[Candidate]) -> list[Candidate]:
    """Collapse candidates that share a ``content_hash`` down to one each.

    For every distinct ``content_hash`` the candidate with the smallest
    ``distance`` is kept. On a tie the candidate seen first wins.

    Args:
        candidates: Objects exposing ``content_hash`` and ``distance``
            attributes.

    Returns:
        A new list with one candidate per hash, ordered by the first
        appearance of each hash in ``candidates``.
    """
    winners: dict[str, Candidate] = {}
    for candidate in candidates:
        key = candidate.content_hash
        incumbent = winners.get(key)
        # Keep the incumbent unless the newcomer is strictly closer.
        if incumbent is not None and not candidate.distance < incumbent.distance:
            continue
        winners[key] = candidate
    return list(winners.values())