#!/usr/bin/env python3
"""Sum the advertised download sizes on oldversion.com pages captured in a WARC archive.

Downloads an archive.org WARC capture (verified by MD5), streams its HTTP
response records, and totals every ``fileSize`` value found in the HTML of
the windows/linux/macos software pages.
"""
import gzip
import hashlib
import re
import sys
import urllib.request
from pathlib import Path
from typing import Iterator, Optional

# Marker line that begins every WARC record (e.g. b"WARC/1.0").
WARC_VERSION = re.compile(rb'^WARC/')
# Extracts the byte count from <meta itemprop=fileSize content="NNN"> tags.
FILE_SIZE = re.compile(r'meta itemprop=fileSize content="(\d+)"', re.IGNORECASE)


class WARCParser:
    """Minimal streaming parser for WARC archives, gzipped or plain."""

    def __init__(self, filepath: Path):
        self.filepath = Path(filepath)

    def _open_file(self):
        """Open the archive for binary reading, transparently handling `.gz`."""
        if self.filepath.suffix == '.gz':
            return gzip.open(self.filepath, 'rb')
        return open(self.filepath, 'rb')

    def _read_headers(self, file) -> dict[str, str]:
        """Read colon-separated header lines up to the first blank line.

        Lines without a colon are skipped; keys/values are decoded with
        replacement so malformed bytes never raise.
        """
        headers: dict[str, str] = {}
        while line := file.readline():
            if not line.strip():
                break  # blank line terminates the header section
            if b':' in line:
                key, _, value = line.partition(b':')
                headers[key.decode('utf-8', errors='replace').strip()] = (
                    value.decode('utf-8', errors='replace').strip())
        return headers

    def _read_content(self, file, length: int) -> tuple[str, str]:
        """Read ``length`` bytes of record payload and split into (HTTP headers, body).

        Returns ``('', '')`` for empty records. If no header/body separator
        (CRLFCRLF or LFLF) is present, the whole payload is returned as the
        header part with an empty body.
        """
        if length <= 0:
            return '', ''
        content = file.read(length)
        for sep in (b'\r\n\r\n', b'\n\n'):
            if sep in content:
                head, body = content.split(sep, 1)
                return (head.decode('utf-8', errors='replace'),
                        body.decode('utf-8', errors='replace'))
        return content.decode('utf-8', errors='replace'), ''

    def parse_records(self) -> Iterator[tuple[dict, str, str]]:
        """Yield ``(warc_headers, http_headers, body)`` for every record in the file."""
        with self._open_file() as file:
            while line := file.readline():
                # Skip inter-record padding until the next "WARC/" marker.
                if not WARC_VERSION.match(line):
                    continue
                warc_headers = self._read_headers(file)
                http_headers, body = self._read_content(
                    file, int(warc_headers.get('Content-Length', 0)))
                yield (warc_headers, http_headers, body)

    def get_html_bodies(self) -> Iterator[tuple[str, str]]:
        """Yield ``(target URI, HTML body)`` for each unique HTTP 200 response record."""
        seen: set[str] = set()
        for warc_headers, http_headers, body in self.parse_records():
            if warc_headers.get('WARC-Type') != 'response':
                continue
            uri = warc_headers.get('WARC-Target-URI', '')
            if uri in seen or ('HTTP/1.1 200' not in http_headers
                               and 'HTTP/1.0 200' not in http_headers):
                continue
            seen.add(uri)
            yield (uri, body)


def _md5_hex(path: Path) -> str:
    """Return the MD5 hex digest of *path*, hashing in 1 MiB chunks.

    Streaming avoids loading a multi-gigabyte archive into memory just to
    verify its checksum.
    """
    digest = hashlib.md5()
    with path.open('rb') as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()


def download_file(url: str, dest: Path, md5: Optional[str] = None) -> None:
    """Download *url* to *dest*, skipping the fetch if *dest* already matches *md5*.

    A pre-existing file whose checksum does not match is deleted and
    re-downloaded; without an expected *md5* the download always runs.
    """
    if dest.exists() and md5:
        if _md5_hex(dest) == md5:
            return  # already have a verified copy
        dest.unlink()  # stale or corrupt copy: remove and re-fetch
    urllib.request.urlretrieve(url, dest)


def main() -> None:
    """Fetch the WARC capture and print per-page and total advertised file sizes."""
    warc_file = Path('warc.gz')
    download_file(
        'https://archive.org/download/archiveteam_archivebot_go_20251003184131_b441438a/www.oldversion.com-inf-20251003-161230-auyuj-00000.warc.gz',
        warc_file,
        'cd26d40b8a9a0a9b1a7ac274b3b78ceb',
    )
    total_kb = 0.0
    count = 0
    for url, html in WARCParser(warc_file).get_html_bodies():
        if not url.startswith("http://www.oldversion.com/"):
            continue
        # parts[3] is the platform segment: http://host/<platform>/...
        parts = url.split("/")
        if (len(parts) < 4
                or parts[3] not in ('windows', 'linux', 'macos')
                or 'fileSize' not in html):
            continue
        for match in FILE_SIZE.finditer(html):
            kb = int(match.group(1)) / 1024
            print(f"{url}: {kb:.2f} KB")
            total_kb += kb
            count += 1
    print(f"\n{'='*60}")
    print(f"Files: {count} | Total: {total_kb:.2f} KB ({total_kb/1024:.2f} MB, {total_kb/1048576:.2f} GB)")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()