import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# --- Configuration for loading the data ---
# Path of the results table to plot. A relative path is resolved against the
# current working directory of the process.
file_path = "symbolic_fit_results_emergent.txt"

# --- Load the data ---
# The file is read as tab-separated text. On any failure a message is printed
# and the script terminates with a non-zero exit status so that shells and
# calling tools can detect the error. SystemExit is raised directly because
# the exit() helper is injected by the `site` module for interactive use and
# is not guaranteed to exist in every interpreter configuration.
try:
    df_results = pd.read_csv(file_path, sep="\t")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please ensure it's in the same directory as this script.")
    raise SystemExit(1)
except Exception as e:
    # Script-level boundary: report any other read/parse failure and stop.
    print(f"An error occurred while loading the file: {e}")
    raise SystemExit(1)
else:
    # Only reached when the read succeeded.
    print(f"Successfully loaded {len(df_results)} constants from {file_path}.")

# --- Prepare data for plotting and outlier detection ---
# Both columns are needed downstream: 'error' for outlier detection and the
# histogram, 'n' for the scatter plot.
required_columns = ['error', 'n']

# Stop with a readable message instead of a KeyError traceback when the input
# file does not provide a required column.
missing_columns = [col for col in required_columns if col not in df_results.columns]
if missing_columns:
    print(f"Error: required column(s) missing from the input file: {missing_columns}")
    raise SystemExit(1)

# Ensure 'error' and 'n' are numeric; entries that cannot be parsed become NaN.
for col in required_columns:
    df_results[col] = pd.to_numeric(df_results[col], errors='coerce')

# Drop rows where 'error' or 'n' is NaN after conversion (e.g., from failed inversions).
df_results_cleaned = df_results.dropna(subset=required_columns)

if df_results_cleaned.empty:
    print("No valid data available for plotting after cleaning. Check your input file.")
else:
    # --- Outlier detection on 'error' using the 1.5 * IQR rule ---
    error_values = df_results_cleaned['error']
    Q1 = error_values.quantile(0.25)
    Q3 = error_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Keep only the rows whose error lies within the bounds (both ends inclusive).
    within_bounds = error_values.between(lower_bound, upper_bound)
    df_no_outliers = df_results_cleaned[within_bounds]

    # Report how many rows were kept and how many were discarded.
    total_points = len(df_results_cleaned)
    kept_points = len(df_no_outliers)
    print(f"Original data points: {total_points}")
    print(f"Outliers identified and removed: {total_points - kept_points}")
    print(f"Data points for plotting (without outliers): {kept_points}")

    if df_no_outliers.empty:
        print("No data points remain for plotting after removing outliers.")
    else:
        # Order rows by error so both plots draw from the same sorted frame.
        df_no_outliers_sorted = df_no_outliers.sort_values("error")
        sorted_errors = df_no_outliers_sorted['error']

        # --- Histogram of the error values that survived outlier removal ---
        hist_fig, hist_ax = plt.subplots(figsize=(10, 5))
        hist_ax.hist(sorted_errors, bins=50, color='skyblue', edgecolor='black')
        hist_ax.set_title('Histogram of Absolute Errors in Symbolic Fit (Outliers Removed)')
        hist_ax.set_xlabel('Absolute Error')
        hist_ax.set_ylabel('Count')
        hist_ax.grid(True)
        hist_fig.tight_layout()
        plt.show()

        # --- Scatter plot of error against n for the same rows ---
        scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
        scatter_ax.scatter(
            df_no_outliers_sorted['n'],
            sorted_errors,
            alpha=0.5,
            s=15,
            c='orange',
            edgecolors='black',
        )
        scatter_ax.set_title('Absolute Error vs Symbolic Dimension n (Outliers Removed)')
        scatter_ax.set_xlabel('n')
        scatter_ax.set_ylabel('Absolute Error')
        scatter_ax.grid(True)
        scatter_fig.tight_layout()
        plt.show()

        print("\nGraphs regenerated successfully, excluding outliers!")