import os
import re
from pathlib import Path
from collections import defaultdict

# Directory tree that is scanned for both pages and image files.
root = Path("/home/josefkulovany/")

# File suffixes (lower-case, with the dot) that identify pages to crawl
# and image files to index.
page_exts = {".php", ".html"}
img_exts = {
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".webp",
    ".svg",
    ".bmp",
    ".tiff",
}

# Two alternatives, one capture group each:
#   group 1 -> value of a quoted src="..." / src='...' attribute
#   group 2 -> raw argument of a CSS-style url(...)
# Only one group is non-empty for any given match.
img_pattern = re.compile(
    r'src=["\']([^"\']+)["\']'
    r'|url\(([^)]+)\)',
    re.IGNORECASE,
)

# Paths of image files found on disk; filled in by the indexing loop.
valid_images = set()

# Referenced image path -> set of pages (relative to root) that reference it.
image_references = defaultdict(set)

# Index every image file below root. Each file is stored under two
# spellings -- "dir/pic.png" and "/dir/pic.png" -- so a reference matches
# whether or not it was written with a leading slash.
for entry in root.rglob("*"):
    if entry.suffix.lower() not in img_exts:
        continue
    if not entry.is_file():
        continue
    relative = str(entry.relative_to(root))
    valid_images.update((relative, "/" + relative))

# Crawl all .html and .php files and record which images each one references.
#
# Keys stored in image_references are:
#   * for local files: the path relative to root, normalised, without a
#     leading slash (this spelling is one of the two kept in valid_images);
#   * for external references (scheme: or protocol-relative //): the
#     reference exactly as written, since it cannot be checked on disk.

# Matches references that point outside the local tree, e.g. "https://...",
# "data:..." or "//cdn.example.com/...".
_EXTERNAL_REF = re.compile(r"^(?:[a-z][a-z0-9+.\-]*:|//)", re.IGNORECASE)


def _resolve_image_reference(raw_src, page_dir):
    """Turn one raw src/url() capture into an image_references key.

    raw_src  -- text captured by img_pattern; may still carry quotes or
                surrounding whitespace (the url(...) alternative keeps them).
    page_dir -- directory of the referencing page, relative to root
                ("." for pages directly under root).

    Returns the key as a string, or None when the reference does not name
    an image file (empty value, script/iframe src, font url(), ...).
    """
    src = raw_src.strip("\"' \t\r\n")

    # A query string or fragment is not part of the file name on disk.
    src = re.split(r"[?#]", src, maxsplit=1)[0]
    if not src:
        return None

    # src= also appears on <script>, <iframe>, etc. Anything without an image
    # extension can never be in valid_images, so it is not an image reference.
    if os.path.splitext(src)[1].lower() not in img_exts:
        return None

    # Leave external URLs untouched: normpath would collapse "//" and
    # corrupt them.
    if _EXTERNAL_REF.match(src):
        return src

    if src.startswith("/"):
        # Site-absolute reference: resolve against root.
        return os.path.normpath(src.lstrip("/"))

    # Page-relative reference: resolve against the page's own directory so
    # that "./" and "../" segments are honoured rather than discarded.
    return os.path.normpath(os.path.join(page_dir, src))


for file in root.rglob("*"):
    # is_file() guards against directories whose name ends in .html/.php.
    if file.suffix.lower() not in page_exts or not file.is_file():
        continue

    # Explicit encoding keeps the result independent of the locale;
    # undecodable bytes are dropped as before.
    content = file.read_text(encoding="utf-8", errors="ignore")
    rel_page = str(file.relative_to(root))
    page_dir = str(file.parent.relative_to(root))

    for match in img_pattern.findall(content):
        # Exactly one of the two capture groups is non-empty per match.
        img_src = _resolve_image_reference(match[0] or match[1], page_dir)
        if img_src is not None:
            image_references[img_src].add(rel_page)

# Split the referenced images into those present in valid_images and those
# that are not, printing one entry per image (sorted by path) together with
# the pages that reference it. Both lists hold (image, pages) tuples.
mapped = []
missing = []

print("\n=== IMAGE REFERENCE MAPPING ===\n")
for img in sorted(image_references):
    pages = image_references[img]
    pages_str = ", ".join(sorted(pages))

    if img not in valid_images:
        missing.append((img, pages))
        print(f"[MISSING] {img}\n          ↳ {pages_str}\n")
        continue

    mapped.append((img, pages))
    print(f"[FOUND]   {img}\n          ↳ {pages_str}\n")

# Totals: found + missing always equals the number of distinct references.
print(
    "=== SUMMARY ===",
    f"✓ Total valid images found:   {len(mapped)}",
    f"✗ Total missing/orphaned:     {len(missing)}",
    f"🔎 Total image references:     {len(image_references)}",
    sep="\n",
)

# Write the FOUND / MISSING classification to a plain-text report in the
# current working directory: all found images first, then all missing ones,
# each group sorted by image path.
#
# The encoding is set explicitly because image and page names may contain
# non-ASCII characters; the platform default encoding could otherwise raise
# UnicodeEncodeError or produce a file that is unreadable elsewhere.
with open("image_mapping_report.txt", "w", encoding="utf-8") as f:
    for label, entries in (("FOUND", mapped), ("MISSING", missing)):
        # Sort on the path only so the page sets are never compared.
        for img, pages in sorted(entries, key=lambda entry: entry[0]):
            f.write(f"[{label}] {img} -> {', '.join(sorted(pages))}\n")
