Files
blocklists/scripts/merge_blocklists.py
T
CodeX 14fd0cf511 Simplify to whitelist/blacklist model
- Rewrite merge_blocklists.py to sync a single blacklist from upstream
  and subtract the locally-maintained whitelist
- Replace whitelist contents with subtitle + webm seed
- Remove blacklist_permissive, whitelist_with_subtitles, and all
  .prev files that are no longer needed
- Rewrite README to reflect the two-file model and link to wiki
2026-04-07 01:09:17 +02:00

68 lines
2.4 KiB
Python

"""Sync the blacklist from upstream Cleanuparr, preserving manual local
additions and stripping entries listed in the locally-maintained whitelist.
See the wiki (Sync) for the full algorithm and rationale.
"""
import urllib.request
# Raw URL that main() downloads the current upstream blacklist from.
UPSTREAM_URL = "https://raw.githubusercontent.com/Cleanuparr/Cleanuparr/main/blacklist"
# The paths below are relative, so they resolve against the current working
# directory of the process running this script.
# Local blacklist file: read as the current state, then overwritten with the
# merged result.
BLACKLIST = "blacklist"
# Snapshot of the upstream list as fetched by the previous run; read as the
# merge baseline, then overwritten with the freshly fetched upstream list.
BLACKLIST_PREV = "blacklist.prev"
# Locally-maintained whitelist file; its entries are subtracted from the
# merged blacklist. Only read, never written, by this script.
WHITELIST = "whitelist"
def read_lines(path):
    """Return the distinct non-blank lines of the text file at *path*.

    Every line is stripped of leading/trailing whitespace. Lines that are
    empty after stripping are dropped, and duplicates collapse because the
    result is a set (so the original line order is not preserved).

    The file is opened in text mode with the platform's default encoding.

    Args:
        path: Path of the file to read.

    Returns:
        A set of stripped, non-empty lines. An empty set is returned when
        the file does not exist; any other I/O error propagates.
    """
    try:
        with open(path) as f:
            # Strip each line exactly once, then keep only non-blank results.
            return {entry for entry in map(str.strip, f) if entry}
    except FileNotFoundError:
        return set()
def _fetch_upstream(url, timeout=30):
    """Download *url* and return its non-empty, stripped lines as a set.

    Args:
        url: Address of the plain-text list to download.
        timeout: Seconds to wait on the connection before giving up. Without
            an explicit value urlopen falls back to the global socket
            default, which normally means blocking forever on a stalled
            connection.

    Returns:
        A set with one element per distinct non-blank line of the response.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        body = response.read().decode()
    return {entry for entry in map(str.strip, body.splitlines()) if entry}


def _write_sorted(path, entries):
    """Write *entries* to *path*, one per line, sorted for deterministic diffs."""
    with open(path, "w") as f:
        f.write("\n".join(sorted(entries)) + "\n")


def main():
    """Sync the local blacklist with upstream and refresh the snapshot.

    Steps:
      1. Fetch the current upstream blacklist from UPSTREAM_URL.
      2. Load the previous upstream snapshot, the committed local blacklist
         and the whitelist from disk.
      3. Keep every local entry that was not in the previous snapshot
         (manual additions), union it with the new upstream list, then
         remove every entry that appears in the whitelist.
      4. Print a summary, overwrite BLACKLIST with the result and
         BLACKLIST_PREV with the new upstream list.

    Network or I/O errors (including a fetch timeout) propagate to the
    caller; nothing is written if the fetch fails because the download
    happens first.
    """
    # Fetch the current upstream blacklist
    upstream_new = _fetch_upstream(UPSTREAM_URL)

    # Previous upstream snapshot: baseline for detecting local additions.
    # On first run (snapshot missing or containing no entries), use the
    # current upstream as the baseline so that upstream entries are not
    # mistaken for local additions.
    upstream_prev = read_lines(BLACKLIST_PREV)
    if not upstream_prev:
        upstream_prev = upstream_new.copy()

    # Current committed blacklist (may contain manual local additions)
    local = read_lines(BLACKLIST)

    # Locally-maintained whitelist (exclusion source)
    whitelist = read_lines(WHITELIST)

    # Three-way merge: anything in local but not in the previous upstream
    # snapshot is a manual local addition that must be preserved.
    custom = local - upstream_prev
    merged = upstream_new | custom

    # Strip whitelist entries from the merged result.
    result = merged - whitelist

    # Reporting for the workflow log
    print(f"[{BLACKLIST}] Upstream added: {sorted(upstream_new - upstream_prev)}")
    print(f"[{BLACKLIST}] Upstream removed: {sorted(upstream_prev - upstream_new)}")
    print(f"[{BLACKLIST}] Custom preserved: {sorted(custom)}")
    print(f"[{BLACKLIST}] Whitelist stripped: {sorted(merged & whitelist)}")

    # Write the merged blacklist first, then store the new upstream snapshot
    # that the next run will use as its baseline.
    _write_sorted(BLACKLIST, result)
    _write_sorted(BLACKLIST_PREV, upstream_new)


if __name__ == "__main__":
    main()