import os
import re
import shutil
import requests
from pathlib import Path
from urllib.parse import urljoin, urlparse

# CONFIG
# Directory treated as the web root: it is walked recursively for pages, and
# every asset reference found in them is resolved relative to it.
project_root = Path("/home/josefkulovany")  # your actual root
# Base URLs tried, in this order, when a missing asset is fetched remotely.
# is_valid_url() checks links against the same list.
known_remote_prefixes = [
    "https://www.youtube.com", "https://fonts.googleapis.com", "https://fonts.gstatic.com"
]
# Downloaded files are stored here under their bare file name and then
# symlinked into the location the page expects.
# NOTE: the directory is created as soon as this module runs; mkdir() is
# called without parents=True, so project_root itself must already exist.
fallback_dir = project_root / "_missing_assets"
fallback_dir.mkdir(exist_ok=True)

# TAGS and ATTRIBUTES to scan
# Maps a tag name to the attribute on that tag that carries the asset URL;
# extract_assets() builds one regular expression per entry.
asset_patterns = {
    "img": "src",
    "script": "src",
    "link": "href",
    "iframe": "src",
    "source": "src",
    "video": "src",
    "audio": "src",
    "object": "data",
}

# EXTENSIONS to look for
# Suffixes of the files that are scanned; the scan lower-cases each file's
# suffix before comparing, so the entries here must be lower-case.
web_exts = [".html", ".php"]

# STORAGE
# Unique web-root-relative paths that were referenced but not found on disk.
missing_assets = set()
# Running totals printed in the final summary.
symlinked = 0   # missing assets linked to a same-named local file
downloaded = 0  # missing assets downloaded and linked from fallback_dir
placeheld = 0   # missing assets replaced by a placeholder file
existing = 0    # references whose target already existed (counted per reference)

def is_valid_url(link):
    """Report whether *link* begins with one of the known remote prefixes.

    Returns True on the first prefix in ``known_remote_prefixes`` that
    *link* starts with, and False when none of them match.
    """
    for prefix in known_remote_prefixes:
        if link.startswith(prefix):
            return True
    return False

def extract_assets(html_text, patterns=None):
    """Collect asset references from HTML-like markup.

    Args:
        html_text: Markup to scan. Empty or None yields an empty list.
        patterns: Optional mapping of tag name to the attribute holding the
            asset URL. Defaults to the module-level ``asset_patterns``.

    Returns:
        A list of attribute values, grouped by tag in the mapping's
        iteration order and, within a tag, in document order. Duplicates
        are kept.
    """
    if not html_text:
        return []
    if patterns is None:
        patterns = asset_patterns

    assets = []
    for tag, attr in patterns.items():
        # \s after the tag name stops "link" from matching "<linker ...>".
        # The value must be closed by the same quote that opened it (\1), so
        # an apostrophe inside a double-quoted value no longer truncates it.
        # Names are escaped because callers may now supply their own mapping.
        pattern = fr'<{re.escape(tag)}\s[^>]*{re.escape(attr)}=(["\'])(.*?)\1'
        assets.extend(
            match.group(2)
            for match in re.finditer(pattern, html_text, re.IGNORECASE)
        )
    return assets

def try_find_local_match(filename):
    """Find a regular file named *filename* anywhere under ``project_root``.

    The tree is walked recursively and the first hit in traversal order is
    returned; None is returned when no file with that exact name exists.
    """
    hits = (
        entry
        for entry in project_root.rglob("*")
        if entry.name == filename and entry.is_file()
    )
    return next(hits, None)

def safe_download(url, dest):
    """Fetch *url* and store the response body in *dest*.

    The request uses a 5-second timeout and an HTTP error status counts as
    a failure. Any exception is reported on stdout rather than raised.

    Returns:
        True when the body was written to *dest*, False otherwise.
    """
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        dest.write_bytes(response.content)
    except Exception as err:
        print(f"⚠ Failed to download {url} → {err}")
        return False
    return True

def create_placeholder(path):
    """Write the one-line stub marker to *path*, replacing any existing content."""
    with path.open("w") as stub:
        stub.write("// placeholder file\n")

def _asset_relpath(ref):
    """Map an asset reference to a path relative to the web root.

    Args:
        ref: Attribute value taken from a page, with surrounding whitespace
            already stripped.

    Returns:
        A "/"-joined relative path with "." and ".." segments resolved, or
        None when the reference does not name a file under the web root:
        empty values, fragments, data URIs, values containing inline PHP,
        schemes other than http/https, malformed URLs, and references whose
        path is empty after normalisation.
    """
    # "<?" marks a value that is generated by PHP at request time; it cannot
    # be resolved statically and would otherwise yield junk paths such as "<".
    if not ref or ref.startswith(("data:", "#")) or "<?" in ref:
        return None
    try:
        parsed = urlparse(ref)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. an unclosed IPv6 literal).
        return None
    # about:, javascript:, mailto:, tel: ... do not point at files.
    if parsed.scheme not in ("", "http", "https"):
        return None

    # Resolve dot segments lexically, the way URL resolution does: ".." can
    # never climb above the root, so the result always stays inside it.
    segments = []
    for segment in parsed.path.split("/"):
        if segment in ("", "."):
            continue
        if segment == "..":
            if segments:
                segments.pop()
            continue
        segments.append(segment)
    return "/".join(segments) or None


print("🔎 Scanning HTML and PHP files...\n")

# Take a snapshot of the pages before touching the tree, so that files this
# run creates (placeholders, downloads, links) are not scanned in turn.
web_files = [
    candidate
    for candidate in project_root.rglob("*")
    if candidate.suffix.lower() in web_exts and candidate.is_file()
]

for file in web_files:
    try:
        # Explicit encoding: the locale default may silently drop non-ASCII
        # characters from paths when combined with errors="ignore".
        html = file.read_text(encoding="utf-8", errors="ignore")
    except OSError as e:
        print(f"⚠ Could not read {file} → {e}")
        continue

    for ref in extract_assets(html):
        ref = ref.strip()
        asset_path = _asset_relpath(ref)  # always relative to web root
        if asset_path is None:
            continue

        dest_path = project_root / asset_path
        if dest_path.exists():
            existing += 1
            continue

        missing_assets.add(asset_path)
        asset_name = dest_path.name

        try:
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            # exists() is False but a link is present: it is dangling (left
            # over from an earlier run). Remove it so a fresh link can be
            # made and a placeholder is not written through it.
            if dest_path.is_symlink():
                dest_path.unlink()
        except OSError as e:
            print(f"❌ Cannot prepare {dest_path}: {e}")
            continue

        # Try local match
        local = try_find_local_match(asset_name)
        if local:
            try:
                dest_path.symlink_to(local.resolve())
            except Exception as e:
                print(f"❌ Symlink error: {e}")
            else:
                print(f"🔗 Symlinked {dest_path} → {local}")
                symlinked += 1
                continue

        # Try remote download. A reference that already points at an allowed
        # host is requested exactly as written first (keeping its query
        # string); after that the path is tried against every known prefix.
        candidate_urls = [ref] if is_valid_url(ref) else []
        candidate_urls += [
            urljoin(remote_base + "/", asset_path)
            for remote_base in known_remote_prefixes
        ]
        download_dest = fallback_dir / asset_name
        # dict.fromkeys drops duplicate URLs while keeping their order.
        for full_url in dict.fromkeys(candidate_urls):
            if not safe_download(full_url, download_dest):
                continue
            try:
                dest_path.symlink_to(download_dest.resolve())
            except Exception as e:
                print(f"❌ Link fail after download: {e}")
                continue
            print(f"🌐 Downloaded + Linked {dest_path} ← {full_url}")
            downloaded += 1
            break
        else:
            # Final fallback: placeholder
            try:
                create_placeholder(dest_path)
            except OSError as e:
                print(f"❌ Placeholder failed for {dest_path}: {e}")
            else:
                print(f"📄 Placeholder created for {dest_path}")
                placeheld += 1

print("\n✅ Done!")
print(f"✔ Existing assets found: {existing}")
print(f"🔗 Symlinks created: {symlinked}")
print(f"🌍 Files downloaded: {downloaded}")
print(f"📄 Placeholders created: {placeheld}")
print(f"❓ Total missing (unique): {len(missing_assets)}")
