import numpy as np
import pandas as pd
import re
from scipy.optimize import minimize
from tqdm import tqdm
from joblib import Parallel, delayed
import logging
import time
import matplotlib.pyplot as plt
import os

# Configure logging at import time: records at INFO and above go to the file
# 'symbolic_fit.log', formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    filename='symbolic_fit.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def _primes_below(limit):
    """Return every prime p with 2 <= p < limit, ascending, as plain ints."""
    is_candidate = bytearray([1]) * limit
    is_candidate[:2] = b"\x00\x00"  # 0 and 1 are not prime
    for p in range(2, int(limit ** 0.5) + 1):
        if is_candidate[p]:
            # Strike out the multiples of p, starting at p*p.
            is_candidate[p * p::p] = bytearray(len(range(p * p, limit, p)))
    return [i for i, flag in enumerate(is_candidate) if flag]


# Extended primes list: all primes below 1000 (168 values), ascending.
PRIMES = _primes_below(1000)

# Golden ratio.
phi = (1 + np.sqrt(5)) / 2
# Memo table for fib_real, keyed by the exact argument value.
fib_cache = {}


def fib_real(n):
    """Evaluate the script's real-index Fibonacci-style term at n.

    Computes phi**n / sqrt(5) - (1/phi)**n * cos(pi * n) and memoises the
    result in fib_cache. A cached entry is returned first; otherwise any
    argument above 100 yields 0.0 without being stored.

    NOTE(review): the textbook real extension of Binet's formula also divides
    the second term by sqrt(5) (so that F(1) == 1). This implementation does
    not, e.g. fib_real(1) is about 1.34. Behaviour is kept as-is because all
    fits depend on it -- confirm whether this is intentional.
    """
    try:
        return fib_cache[n]
    except KeyError:
        pass
    if n > 100:
        return 0.0
    growth = phi**n / np.sqrt(5)
    decay = ((1/phi)**n) * np.cos(np.pi * n)
    fib_cache[n] = growth - decay
    return fib_cache[n]

def D(n, beta, r=1.0, k=1.0, Omega=1.0, base=2, scale=1.0):
    """Evaluate the symbolic model at index n + beta.

    With F = fib_real(n + beta), the magnitude of the result is

        sqrt(scale * phi * |F| * base**(n + beta) * Omega) * r**k

    and, for n > 1000, the product under the root is additionally multiplied
    by ln(n) / ln(1000). The product is accumulated in log space; scale, |F|
    and Omega are floored at 1e-30 and base at 1e-10 before their logarithms
    are taken, and the value under the square root is floored at 1e-30. The
    result carries the sign of F (and is 0 when F is 0).

    Args:
        n: Index; combined with beta as n + beta.
        beta: Offset added to n.
        r, k: The result is multiplied by r ** k.
        Omega: Multiplicative factor inside the root.
        base: Base of the exponential term base**(n + beta).
        scale: Multiplicative factor inside the root.

    Returns:
        The model value, or None when |(n + beta) * ln(base)| exceeds 700,
        when the accumulated logarithm is not finite, or when any exception
        is raised during evaluation (the exception is logged).
    """
    # NOTE(review): an earlier revision looked up a prime from PRIMES at
    # floor(n + beta) here but never used it; that dead lookup was removed.
    # If the prime was meant to be a factor of the product, it has to be
    # added to log_val explicitly.
    try:
        x = n + beta
        Fn_beta = fib_real(x)
        log_dyadic = x * np.log(max(base, 1e-10))
        # Keep the exponential term within a range where exp() stays usable.
        if log_dyadic > 700 or log_dyadic < -700:
            return None
        log_val = (
            np.log(max(scale, 1e-30))
            + np.log(phi)
            + np.log(max(abs(Fn_beta), 1e-30))
            + log_dyadic
            + np.log(max(Omega, 1e-30))
        )
        if n > 1000:
            log_val += np.log(np.log(n) / np.log(1000))
        if not np.isfinite(log_val):
            return None
        val = np.exp(log_val) * np.sign(Fn_beta)
        return np.sqrt(max(abs(val), 1e-30)) * (r ** k) * np.sign(val)
    except Exception as e:
        logging.error(f"D failed: n={n}, beta={beta}, error={e}")
        return None

def invert_D(value, r=1.0, k=1.0, Omega=1.0, base=2, scale=1.0, max_n=500):
    """Grid-search D for parameters whose output approximates |value|.

    The search covers 100 values of n in [0, max_n], 10 values of beta in
    [0, 1], 20 log-spaced scale factors centred on log10(|value|) (clamped to
    [-30, 30]) and a fixed 5 x 5 grid of (r, k) values. Every evaluation
    within 50% relative distance of |value| is a candidate; the candidate
    with the smallest absolute difference wins.

    Args:
        value: Target; only its absolute value is used for matching.
        r, k: Accepted for interface compatibility but not used -- the search
            always scans its own fixed (r, k) grid.
        Omega, base: Forwarded to D unchanged.
        scale: Base scale; each trial passes scale * dynamic_scale to D.
        max_n: Accepted for interface compatibility but not used -- the upper
            bound of n is always derived from the magnitude of value.

    Returns:
        A 6-tuple (n, beta, dynamic_scale, emergent_uncertainty, r, k) for the
        best candidate, where emergent_uncertainty is the standard deviation
        of the D values of the (up to) five best candidates, or 1% of the
        best value when there is only one. A tuple of six None values is
        returned when no candidate is found, when the uncertainty is not
        finite, or when the search raises (the failure is logged).
    """
    no_fit = (None, None, None, None, None, None)
    target = abs(value)
    log_val = np.log10(max(target, 1e-30))
    # Upper bound of n grows with the order of magnitude of the target,
    # limited to the range [200, 1000].
    max_n = min(1000, max(200, int(100 * abs(log_val) + 100)))
    n_values = np.linspace(0, max_n, 100)
    beta_values = np.linspace(0, 1, 10)
    scale_factors = np.logspace(max(log_val - 10, -30), min(log_val + 10, 30), num=20)
    rk_grid = (0.1, 0.5, 1.0, 2.0, 5.0)
    rel_scale = max(target, 1e-30)
    try:
        # Each candidate keeps its D value so it never has to be recomputed.
        candidates = []
        for n in tqdm(n_values, desc=f"invert_D for {value:.2e}", leave=False):
            for beta in beta_values:
                for dynamic_scale in scale_factors:
                    for r_local in rk_grid:
                        for k_local in rk_grid:
                            val = D(n, beta, r_local, k_local, Omega, base, scale * dynamic_scale)
                            if val is None or not np.isfinite(val):
                                continue
                            diff = abs(val - target)
                            if diff / rel_scale < 0.5:
                                candidates.append((diff, n, beta, dynamic_scale, r_local, k_local, val))
        if not candidates:
            logging.error(f"invert_D: No valid candidates for value {value}")
            return no_fit
        # sorted() is stable, so ties keep their scan order.
        top = sorted(candidates, key=lambda c: c[0])[:5]
        top_vals = [c[6] for c in top]
        if len(top_vals) > 1:
            emergent_uncertainty = np.std(top_vals)
        else:
            emergent_uncertainty = abs(top_vals[0]) * 0.01
        if not np.isfinite(emergent_uncertainty):
            logging.error(f"invert_D: Non-finite emergent uncertainty for value {value}")
            return no_fit
        _, n_best, beta_best, scale_best, r_best, k_best, _ = top[0]
        return n_best, beta_best, scale_best, emergent_uncertainty, r_best, k_best
    except Exception as e:
        logging.error(f"invert_D failed for value {value}: {e}")
        return no_fit

def parse_categorized_codata(filename):
    """
    Load the categorized CODATA table from a tab-separated text file.

    The first line of the file is treated as a header and replaced by the
    column names name, value, uncertainty, unit, category. The literal string
    'exact' is read as a missing value, and missing uncertainties are then
    set to 0.0.

    Returns the resulting DataFrame. Any failure is logged and re-raised
    (FileNotFoundError is logged with its own message).
    """
    column_names = ['name', 'value', 'uncertainty', 'unit', 'category']
    column_types = {'name': str, 'value': float, 'uncertainty': float, 'unit': str, 'category': str}
    try:
        table = pd.read_csv(
            filename,
            sep='\t',
            header=0,
            names=column_names,
            dtype=column_types,
            na_values=['exact'],
        )
        # 'exact' was parsed as NaN above; exact constants get zero uncertainty.
        table['uncertainty'] = table['uncertainty'].fillna(0.0)
        # Sanity check that the columns the rest of the script needs exist.
        missing = [col for col in ('name', 'value', 'uncertainty', 'unit') if col not in table.columns]
        if missing:
            raise ValueError(f"Missing required columns in {filename}: {missing}")
        logging.info(f"Successfully parsed {len(table)} constants from {filename}")
        return table
    except FileNotFoundError:
        logging.error(f"Input file {filename} not found")
        raise
    except Exception as e:
        logging.error(f"Error parsing {filename}: {e}")
        raise

def generate_emergent_constants(n_max=1000, beta_steps=20, r_values=(0.5, 1.0, 2.0, 5.0), k_values=(0.5, 1.0, 2.0, 5.0), Omega=1.0, base=2, scale=1.0):
    """Tabulate D over a grid of (n, beta, r, k).

    Args:
        n_max: Upper end of the n grid; 200 evenly spaced values in [0, n_max].
        beta_steps: Number of evenly spaced beta values in [0, 1].
        r_values, k_values: Values of r and k to combine with every (n, beta).
            Must be re-iterable (they are looped over once per grid point).
        Omega, base, scale: Forwarded to D unchanged.

    Returns:
        DataFrame with columns n, beta, value, r, k, scale holding one row
        per grid point where D returned a finite value. The columns are
        present even when no grid point qualifies.
    """
    columns = ['n', 'beta', 'value', 'r', 'k', 'scale']
    n_values = np.linspace(0, n_max, 200)
    beta_values = np.linspace(0, 1, beta_steps)
    candidates = []
    for n in tqdm(n_values, desc="Generating emergent constants"):
        for beta in beta_values:
            for r in r_values:
                for k in k_values:
                    val = D(n, beta, r, k, Omega, base, scale)
                    if val is None or not np.isfinite(val):
                        continue
                    candidates.append({
                        'n': n, 'beta': beta, 'value': val, 'r': r, 'k': k, 'scale': scale,
                    })
    # Explicit columns keep the schema stable when the list is empty.
    return pd.DataFrame(candidates, columns=columns)

def match_to_codata(df_emergent, df_codata, tolerance=0.01):
    """Pair each CODATA value with the emergent values that lie close to it.

    Args:
        df_emergent: DataFrame with columns n, beta, value, r, k, scale.
        df_codata: DataFrame with columns name, value, uncertainty.
        tolerance: Maximum relative difference |emergent - codata| / |codata|
            (denominator floored at 1e-30) for a pair to be reported.

    Returns:
        DataFrame with one row per matching pair and the columns name,
        codata_value, emergent_value, n, beta, r, k, scale, error, rel_error,
        codata_uncertainty, bad_data, bad_data_reason. A pair is marked
        bad_data when its relative error exceeds 0.5, or when the absolute
        error differs from the CODATA uncertainty by more than ten times that
        uncertainty. The columns are present even when nothing matches.
    """
    columns = [
        'name', 'codata_value', 'emergent_value', 'n', 'beta', 'r', 'k', 'scale',
        'error', 'rel_error', 'codata_uncertainty', 'bad_data', 'bad_data_reason',
    ]
    matches = []
    if df_emergent.empty:
        # Nothing to match against (and possibly no 'value' column at all).
        return pd.DataFrame(matches, columns=columns)

    for _, codata_row in df_codata.iterrows():
        value = codata_row['value']
        uncertainty = codata_row['uncertainty']
        denom = max(abs(value), 1e-30)
        close = (df_emergent['value'] - value).abs() / denom < tolerance
        for _, emergent_row in df_emergent[close].iterrows():
            error = abs(emergent_row['value'] - value)
            rel_error = error / denom
            high_rel_error = rel_error > 0.5
            uncertainty_off = (
                uncertainty is not None
                and abs(uncertainty - error) > 10 * uncertainty
            )
            if high_rel_error:
                reason = f"High rel_error ({rel_error:.2e})"
            elif uncertainty_off:
                reason = f"Uncertainty deviation ({uncertainty:.2e} vs. {error:.2e})"
            else:
                reason = ""
            matches.append({
                'name': codata_row['name'],
                'codata_value': value,
                'emergent_value': emergent_row['value'],
                'n': emergent_row['n'],
                'beta': emergent_row['beta'],
                'r': emergent_row['r'],
                'k': emergent_row['k'],
                'scale': emergent_row['scale'],
                'error': error,
                'rel_error': rel_error,
                'codata_uncertainty': uncertainty,
                'bad_data': bool(high_rel_error or uncertainty_off),
                'bad_data_reason': reason,
            })
    # Explicit columns keep the schema stable when there are no matches.
    return pd.DataFrame(matches, columns=columns)

def check_physical_consistency(df_results):
    """Check fitted constants against a fixed list of cross-relations.

    Each relation names two or three constants, a function returning a
    non-negative deviation computed from their result rows, a threshold and a
    reason text. A relation is evaluated only when every named constant is
    present in df_results['name']; the first row of each name is used. When
    the deviation exceeds the threshold, both constants of a two-name
    relation are reported, but only the last constant of a three-name one.

    The fine-structure and Bohr-magneton relations use the reduced Planck
    constant, derived from the 'Planck constant' row as h / (2*pi).

    NOTE(review): all thresholds are absolute, so relations between very
    small SI values can hardly ever trigger -- confirm this is intended.

    Args:
        df_results: DataFrame with a 'name' column plus the columns read by
            the relations ('scale', 'n', 'codata_value').

    Returns:
        List of (name, "Physical inconsistency: <reason>") tuples. A relation
        whose evaluation raises is logged as a warning and skipped.
    """
    two_pi = 2 * np.pi
    relations = [
        ('Planck constant', 'reduced Planck constant', lambda x, y: abs(x['scale'] / y['scale'] - 2 * np.pi), 0.1, 'scale ratio vs. 2π'),
        ('proton mass', 'proton-electron mass ratio', lambda x, y: abs(x['n'] - y['n'] - np.log10(1836)), 0.5, 'n difference vs. log(proton-electron ratio)'),
        ('Fermi coupling constant', 'weak mixing angle', lambda x, y: abs(x['scale'] - y['scale'] / np.sqrt(2)), 0.1, 'scale vs. sin²θ_W/√2'),
        ('tau energy equivalent', 'tau mass energy equivalent in MeV', lambda x, y: abs(x['codata_value'] - y['codata_value']), 0.01, 'value consistency'),
        ('proton mass', 'electron mass', 'proton-electron mass ratio',
         lambda x, y, z: abs(z['n'] - abs(x['n'] - y['n'])), 10.0, 'n inconsistency for mass ratio'),
        # alpha = e^2 / (4 pi eps0 hbar c), with hbar = h / (2 pi).
        ('fine-structure constant', 'elementary charge', 'Planck constant',
         lambda x, y, z: abs(x['codata_value'] - y['codata_value']**2 / (4 * np.pi * 8.854187817e-12 * (z['codata_value'] / two_pi) * 299792458)), 0.01, 'fine-structure vs. e²/(4πε₀ħc)'),
        # mu_B = e hbar / (2 m_e), with hbar = h / (2 pi).
        ('Bohr magneton', 'elementary charge', 'Planck constant',
         lambda x, y, z: abs(x['codata_value'] - y['codata_value'] * (z['codata_value'] / two_pi) / (2 * 9.1093837e-31)), 0.01, 'Bohr magneton vs. eħ/(2m_e)')
    ]
    bad_data = []
    for relation in relations:
        try:
            # Layout: 2 or 3 constant names, then function, threshold, reason.
            *names, check_func, threshold, reason = relation
            rows = []
            for constant in names:
                hits = df_results[df_results['name'] == constant]
                if hits.empty:
                    break
                rows.append(hits.iloc[0])
            if len(rows) != len(names):
                continue  # at least one constant is missing
            if check_func(*rows) > threshold:
                flagged = names if len(names) == 2 else names[-1:]
                bad_data.extend((constant, f"Physical inconsistency: {reason}") for constant in flagged)
        except Exception as e:
            logging.warning(f"Physical consistency check failed for {relation}: {e}")
            continue
    return bad_data

def total_error(params, df_subset):
    """Objective for the optimiser: mean absolute fit error over df_subset.

    params is the sequence (r, k, Omega, base, scale), which is forwarded to
    symbolic_fit_all_constants. Returns the mean of the non-missing 'error'
    values of the fit, or np.inf when the fit is empty or has no such values.
    """
    r, k, Omega, base, scale = params
    fit = symbolic_fit_all_constants(df_subset, base=base, Omega=Omega, r=r, k=k, scale=scale)
    if fit.empty:
        return np.inf
    errors = fit['error'].dropna()
    if errors.empty:
        return np.inf
    return errors.mean()

def process_constant(row, r, k, Omega, base, scale):
    """Fit a single constant and return its result record as a dict.

    row is a mapping with the keys name, value, uncertainty and unit. The
    absolute value is inverted with invert_D, the best parameters are
    evaluated with D, and the sign of the original value is restored.

    The record has the keys name, codata_value, unit, n, beta,
    emergent_value, error, rel_error, codata_uncertainty,
    emergent_uncertainty, scale, bad_data, bad_data_reason, r, k. A record is
    marked bad_data when the relative error exceeds 0.5 or when the emergent
    uncertainty is more than 20 times larger or smaller than the CODATA
    uncertainty. When no fit is found, D returns None, or an exception
    occurs, the fit fields are None and bad_data is True.
    """
    def unfitted(reason_text):
        # Record for a constant that could not be fitted.
        return {
            'name': row['name'], 'codata_value': row['value'], 'unit': row['unit'],
            'n': None, 'beta': None, 'emergent_value': None,
            'error': None, 'rel_error': None,
            'codata_uncertainty': row['uncertainty'], 'emergent_uncertainty': None,
            'scale': None, 'bad_data': True, 'bad_data_reason': reason_text,
            'r': None, 'k': None,
        }

    try:
        name, value, uncertainty, unit = row['name'], row['value'], row['uncertainty'], row['unit']
        magnitude = abs(value)
        sign = np.sign(value)
        fit = invert_D(magnitude, r=r, k=k, Omega=Omega, base=base, scale=scale)
        if fit[0] is None:
            return unfitted('No valid fit found')

        n, beta, dynamic_scale, emergent_uncertainty, r_local, k_local = fit
        fitted_scale = scale * dynamic_scale
        approx = D(n, beta, r_local, k_local, Omega, base, fitted_scale)
        if approx is None:
            return unfitted('D function returned None')

        # The search ran on |value|; put the original sign back.
        approx *= sign
        error = abs(approx - value)
        if abs(value) > 0:
            rel_error = error / max(abs(value), 1e-30)
        else:
            rel_error = np.inf

        reasons = []
        if rel_error > 0.5:
            reasons.append(f"High relative error ({rel_error:.2e} > 0.5); ")
        if emergent_uncertainty is not None and uncertainty is not None:
            if emergent_uncertainty > uncertainty * 20 or emergent_uncertainty < uncertainty / 20:
                reasons.append(f"Uncertainty deviates from emergent ({emergent_uncertainty:.2e} vs. {uncertainty:.2e}); ")

        return {
            'name': name, 'codata_value': value, 'unit': unit,
            'n': n, 'beta': beta, 'emergent_value': approx,
            'error': error, 'rel_error': rel_error,
            'codata_uncertainty': uncertainty, 'emergent_uncertainty': emergent_uncertainty,
            'scale': fitted_scale,
            'bad_data': bool(reasons), 'bad_data_reason': "".join(reasons),
            'r': r_local, 'k': k_local,
        }
    except Exception as e:
        logging.error(f"process_constant failed for {row['name']}: {e}")
        return unfitted(f"Processing error: {str(e)}")

def _append_reason(df_results, mask, marker, messages):
    """Flag the rows selected by mask and append each message unless marker is already in that row's reason."""
    if not mask.any():
        return
    df_results.loc[mask, 'bad_data'] = True
    messages = pd.Series(list(messages), index=df_results.index[mask])
    already = df_results.loc[mask, 'bad_data_reason'].str.contains(marker, regex=False, na=False)
    fresh = already.index[~already.to_numpy(dtype=bool)]
    if len(fresh):
        df_results.loc[fresh, 'bad_data_reason'] = (
            df_results.loc[fresh, 'bad_data_reason'] + messages.loc[fresh]
        )


def symbolic_fit_all_constants(df, base=2, Omega=1.0, r=1.0, k=1.0, scale=1.0):
    """Fit every constant in df in parallel and post-process the records.

    Each row of df is passed to process_constant in a joblib worker. The
    resulting records are then flagged as bad_data (with a reason appended to
    bad_data_reason) when:
      * the CODATA uncertainty is an IQR outlier among rows of the same name,
      * the relative error exceeds 0.5,
      * the CODATA uncertainty is more than 20 times larger or smaller than
        the emergent uncertainty,
      * check_physical_consistency reports the constant.
    The relative-error and uncertainty reasons are not appended a second time
    when the record already carries them.

    Args:
        df: DataFrame of constants; its rows are handed to process_constant.
        base, Omega, r, k, scale: Model parameters forwarded to
            process_constant.

    Returns:
        DataFrame of result records (empty when there are none).
    """
    logging.info("Starting symbolic fit for all constants...")
    # 'maxtasksperchild' is a multiprocessing.Pool option, not a joblib/loky
    # one, so it is not passed here.
    results = Parallel(n_jobs=-1, timeout=30, backend='loky')(
        delayed(process_constant)(row, r, k, Omega, base, scale)
        for row in tqdm(df.to_dict('records'), desc="Fitting constants")
    )
    df_results = pd.DataFrame([record for record in results if record is not None])

    if not df_results.empty:
        df_results['bad_data'] = df_results.get('bad_data', False)
        df_results['bad_data_reason'] = df_results.get('bad_data_reason', '')
        df_results['bad_data_reason'] = df_results['bad_data_reason'].fillna('').astype(str)

        # Numeric views for the comparisons below; a column holding only None
        # would otherwise be object-typed and fail on '>' / '<'.
        codata_unc = pd.to_numeric(df_results['codata_uncertainty'], errors='coerce')
        emergent_unc = pd.to_numeric(df_results['emergent_uncertainty'], errors='coerce')
        rel_error = pd.to_numeric(df_results['rel_error'], errors='coerce')

        # IQR outliers of the CODATA uncertainty within each constant name.
        for name in df_results['name'].unique():
            uncertainties = codata_unc[df_results['name'] == name].dropna()
            if uncertainties.empty:
                continue
            Q1, Q3 = np.percentile(uncertainties, [25, 75])
            IQR = Q3 - Q1
            outliers = uncertainties[(uncertainties < Q1 - 1.5 * IQR) | (uncertainties > Q3 + 1.5 * IQR)]
            if not outliers.empty:
                df_results.loc[outliers.index, 'bad_data'] = True
                df_results.loc[outliers.index, 'bad_data_reason'] += 'Uncertainty outlier; '

        high_rel_error_mask = rel_error > 0.5
        _append_reason(
            df_results, high_rel_error_mask, "High relative error (",
            [f"High relative error ({x:.2e} > 0.5); " for x in rel_error[high_rel_error_mask]],
        )

        high_uncertainty_mask = emergent_unc.notna() & (
            (codata_unc > 20 * emergent_unc) | (codata_unc < 0.05 * emergent_unc)
        )
        _append_reason(
            df_results, high_uncertainty_mask, "Uncertainty deviates from emergent (",
            [
                f"Uncertainty deviates from emergent ({cu:.2e} vs. {eu:.2e}); "
                for cu, eu in zip(codata_unc[high_uncertainty_mask], emergent_unc[high_uncertainty_mask])
            ],
        )

        for name, reason in check_physical_consistency(df_results):
            hit = df_results['name'] == name
            df_results.loc[hit, 'bad_data'] = True
            df_results.loc[hit, 'bad_data_reason'] += reason + '; '

    logging.info("Symbolic fit completed.")
    return df_results

def _finish_plot(title, xlabel, ylabel, path):
    """Label the current matplotlib figure, save it to path and close it."""
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(path)
    plt.close()


def main():
    """Run the full pipeline: parse, generate, optimise, fit, report, plot.

    Reads 'categorized_allascii.txt' from the current directory and writes
    'emergent_constants.txt', 'symbolic_fit_results_emergent_fixed.txt',
    'symbolic_fit_results.txt' and four PNG plots. When the final fit yields
    no records, the run stops after logging that there is nothing to save;
    when there are no emergent matches, the corresponding table and plot are
    skipped.

    Raises:
        FileNotFoundError: If the input file does not exist.
    """
    start_time = time.time()
    input_file = "categorized_allascii.txt"
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} not found in the current directory")
    df = parse_categorized_codata(input_file)
    logging.info(f"Parsed {len(df)} constants")

    # Generate emergent constants
    print("Generating emergent constants...")
    emergent_df = generate_emergent_constants(n_max=1000, beta_steps=20)
    matched_df = match_to_codata(emergent_df, df, tolerance=0.01)
    matched_df.to_csv("emergent_constants.txt", sep="\t", index=False)
    logging.info("Saved emergent constants to emergent_constants.txt")

    # Optimize parameters on worst-performing constants
    worst_names = [
        'muon mag. mom. to Bohr magneton ratio', 'electron-deuteron mag. mom. ratio',
        'proton-electron mass ratio', 'neutron-electron mass ratio',
        'electron mag. mom.', 'neutron mag. mom.', 'alpha particle mass energy equivalent in MeV'
    ]
    subset_df = df[df['name'].isin(worst_names)]
    if subset_df.empty:
        subset_df = df.head(50)
    init_params = [1.0, 1.0, 1.0, 2.0, 1.0]  # r, k, Omega, base, scale
    bounds = [(1e-5, 10), (1e-5, 10), (1e-5, 10), (1.5, 10), (1e-5, 100)]

    print("Optimizing symbolic model parameters for worst fits...")
    try:
        res = minimize(total_error, init_params, args=(subset_df,), bounds=bounds, method='L-BFGS-B', options={'maxiter': 50, 'disp': True})
        if not res.success:
            logging.warning(f"Optimization failed: {res.message}")
            r_opt, k_opt, Omega_opt, base_opt, scale_opt = init_params
        else:
            r_opt, k_opt, Omega_opt, base_opt, scale_opt = res.x
        print(f"Optimization complete. Found parameters:\nr = {r_opt:.6f}, k = {k_opt:.6f}, Omega = {Omega_opt:.6f}, base = {base_opt:.6f}, scale = {scale_opt:.6f}")
    except Exception as e:
        logging.error(f"Optimization failed: {e}")
        r_opt, k_opt, Omega_opt, base_opt, scale_opt = init_params
        print(f"Optimization failed: {e}. Using default parameters.")

    # Run final fit
    df_results = symbolic_fit_all_constants(df, base=base_opt, Omega=Omega_opt, r=r_opt, k=k_opt, scale=scale_opt)
    if df_results.empty:
        # Nothing to report or plot; the steps below need the result columns.
        logging.error("No results to save")
        logging.info(f"Total runtime: {time.time() - start_time:.2f} seconds")
        print("No results to save")
        return
    df_results.to_csv("symbolic_fit_results_emergent_fixed.txt", sep="\t", index=False)
    logging.info(f"Saved results to symbolic_fit_results_emergent_fixed.txt")

    logging.info(f"Total runtime: {time.time() - start_time:.2f} seconds")

    # Display results
    fit_columns = ['name', 'codata_value', 'unit', 'n', 'beta', 'emergent_value', 'error', 'codata_uncertainty', 'scale', 'bad_data', 'bad_data_reason']
    df_results_sorted = df_results.sort_values("error", na_position='last')
    print("\nTop 20 best symbolic fits:")
    print(df_results_sorted.head(20)[fit_columns].to_string(index=False))

    print("\nTop 20 worst symbolic fits:")
    print(df_results_sorted.tail(20)[fit_columns].to_string(index=False))

    print("\nPotentially bad data constants summary (possible cheated data):")
    bad_data_df = df_results.loc[
        df_results['bad_data'].eq(True),
        ['name', 'codata_value', 'error', 'rel_error', 'codata_uncertainty', 'emergent_uncertainty', 'bad_data_reason'],
    ]
    bad_data_df = bad_data_df.sort_values('rel_error', ascending=False, na_position='last')
    print(bad_data_df.to_string(index=False))

    # An empty match table may not even have the columns used below.
    has_matches = not matched_df.empty
    print("\nTop 20 emergent constants matches:")
    if has_matches:
        matched_df_sorted = matched_df.sort_values('error', na_position='last')
        print(matched_df_sorted.head(20)[['name', 'codata_value', 'emergent_value', 'n', 'beta', 'error', 'rel_error', 'codata_uncertainty', 'bad_data', 'bad_data_reason']].to_string(index=False))
    else:
        print("(no matches)")

    df_results_sorted.to_csv("symbolic_fit_results.txt", sep="\t", index=False)

    # Plotting
    plt.figure(figsize=(10, 5))
    plt.hist(df_results_sorted['error'].dropna(), bins=50, color='skyblue', edgecolor='black')
    _finish_plot('Histogram of Absolute Errors in Symbolic Fit', 'Absolute Error', 'Count', 'histogram_errors.png')

    plt.figure(figsize=(10, 5))
    plt.scatter(df_results_sorted['n'], df_results_sorted['error'], alpha=0.5, s=15, c='orange', edgecolors='black')
    _finish_plot('Absolute Error vs Symbolic Dimension n (Fitted)', 'n', 'Absolute Error', 'scatter_n_error.png')

    plt.figure(figsize=(10, 5))
    worst_fits = df_results_sorted.tail(20)
    plt.bar(worst_fits['name'], worst_fits['error'], color='salmon', edgecolor='black')
    plt.xticks(rotation=90)
    _finish_plot('Absolute Errors for Top 20 Worst Symbolic Fits', 'Constant Name', 'Absolute Error', 'bar_worst_fits.png')

    if has_matches:
        plt.figure(figsize=(10, 5))
        plt.scatter(matched_df_sorted['codata_value'], matched_df_sorted['emergent_value'], alpha=0.5, s=15, c='purple', edgecolors='black')
        # Dashed identity line: perfect agreement between the two values.
        lo = matched_df_sorted['codata_value'].min()
        hi = matched_df_sorted['codata_value'].max()
        plt.plot([lo, hi], [lo, hi], 'k--')
        _finish_plot('Emergent Constants vs. CODATA Values', 'CODATA Value', 'Emergent Value', 'scatter_codata_emergent.png')


if __name__ == "__main__":
    main()