This repository has no description
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 940 B View raw
"""Content-based dedup: collapse fork READMEs that share identical text."""

from __future__ import annotations

from hashlib import md5
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed by annotations, which are never evaluated at runtime thanks
    # to ``from __future__ import annotations`` above.
    from app.types import Candidate

# Number of leading characters of the content that feed the fallback digest.
_PREFIX_CHARS = 500


def content_hash(content: str | None = None, *, content_sha500: str | None = None) -> str:
    """Return the dedup key for a piece of content.

    Args:
        content: Raw text. ``None`` is hashed exactly like the empty string.
        content_sha500: Precomputed key. When non-empty it is returned
            unchanged and ``content`` is ignored.

    Returns:
        ``content_sha500`` if it is non-empty; otherwise the MD5 hex digest
        (despite the "sha" in the parameter name) of the UTF-8 encoding of
        the first 500 characters of ``content``.
    """
    if content_sha500:
        return content_sha500
    # The slice is taken before encoding, so the limit counts characters,
    # not bytes.
    prefix = (content or "")[:_PREFIX_CHARS]
    return md5(prefix.encode("utf-8")).hexdigest()


def row_content_hash(row: dict) -> str:
    """Return the dedup key for a row mapping.

    ``row["content_sha500"]`` is used when it is a non-empty string. Any
    other value (missing, empty, or not a ``str``) falls back to hashing
    ``row["content"]`` via :func:`content_hash`.
    """
    sha = row.get("content_sha500")
    # Non-string values are discarded here; empty strings are rejected by
    # content_hash itself, so both cases end up hashing the content.
    precomputed = sha if isinstance(sha, str) else None
    return content_hash(row.get("content"), content_sha500=precomputed)


def collapse_forks(candidates: list[Candidate]) -> list[Candidate]:
    """Keep one candidate per content_hash — the one with the smallest distance.

    Ties keep the candidate seen first, because the comparison is a strict
    ``<``. The result follows the order in which each distinct hash first
    appears in ``candidates`` (dict insertion order), even when a later,
    closer candidate replaces the stored one.
    """
    best: dict[str, Candidate] = {}
    for c in candidates:
        prev = best.get(c.content_hash)
        if prev is None or c.distance < prev.distance:
            best[c.content_hash] = c
    return list(best.values())