import os
import sys
from pathlib import Path

sys.path.insert(0, os.path.abspath('..'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from src.real_benchmark import DATASET_SPECS, FEATURE_COLUMNS, prepare_sequence_dataset
from src.evaluate import (
    classification_metrics,
    conformal_margin,
    forecast_metrics,
    plot_anomaly_scores,
    plot_attention_heatmap,
    plot_forecast,
    plot_model_comparison,
    run_mc_dropout,
)

# Shared figure styling: seaborn whitegrid theme, wide default canvas, and no
# top/right axis spines.
plt.style.use('seaborn-v0_8-whitegrid')
for rc_key, rc_value in {
    'figure.figsize': (12, 4),
    'axes.spines.top': False,
    'axes.spines.right': False,
}.items():
    plt.rcParams[rc_key] = rc_value

# Seed the NumPy and PyTorch random number generators with the same value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on CUDA when PyTorch reports it as available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print({'device': str(device), 'torch': torch.__version__})

# Output directory, created next to the notebook's parent folder if missing.
OUTPUT_DIR = Path('../outputs')
OUTPUT_DIR.mkdir(exist_ok=True)

from src.models import GRUForecaster, LSTMForecaster, TransformerForecaster

# Dataset keys handed to prepare_sequence_dataset() and used to index
# DATASET_SPECS.
DATASETS = [
    'machine_temperature_system_failure',
    'ec2_cpu_utilization_24ae8d',
    'nyc_taxi',
]

SEQ_LEN = 48              # passed as seq_len= to prepare_sequence_dataset()
HORIZON = 12              # passed as horizon= to the dataset builder and every model
EPOCHS = 3                # default number of passes in train_model()
BATCH_SIZE = 256          # batch size used by make_loader()
ALPHA = 0.8               # weight on the forecast loss; (1 - ALPHA) goes to the incident loss
MAX_TRAIN_WINDOWS = 6000  # default cap applied by maybe_cap_split()


def make_loader(X, yf, yl, shuffle=False):
    """Bundle inputs, forecast targets and labels into a ``DataLoader``.

    Each argument is copied into a float32 tensor; the three tensors are
    combined, in the order ``(X, yf, yl)``, into a ``TensorDataset`` that is
    batched with the module-level ``BATCH_SIZE``.

    Args:
        X: Model inputs, one entry per window.
        yf: Forecast targets aligned with ``X``.
        yl: Labels aligned with ``X``.
        shuffle: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` yielding ``(X, yf, yl)`` batches.
    """
    float_tensors = [torch.tensor(part, dtype=torch.float32) for part in (X, yf, yl)]
    dataset = TensorDataset(*float_tensors)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)


def maybe_cap_split(X, y, l, max_windows=MAX_TRAIN_WINDOWS):
    """Keep at most the last ``max_windows`` entries of a split.

    When ``X`` already has ``max_windows`` entries or fewer, the three inputs
    are returned unchanged. Otherwise each one is truncated to its trailing
    ``max_windows`` entries.

    Args:
        X: Inputs of the split; its length decides whether capping happens.
        y: Targets aligned with ``X``.
        l: Labels aligned with ``X``.
        max_windows: Maximum number of trailing entries to keep.

    Returns:
        A ``(X, y, l)`` tuple, possibly truncated.

    Raises:
        ValueError: If ``max_windows`` is negative.
    """
    if max_windows < 0:
        raise ValueError(f'max_windows must be non-negative, got {max_windows}')
    if len(X) <= max_windows:
        return X, y, l

    def _tail(values):
        # ``values[-0:]`` is the whole sequence, not an empty one, so compute
        # the start index explicitly to make a cap of 0 really keep nothing.
        return values[max(len(values) - max_windows, 0):]

    return _tail(X), _tail(y), _tail(l)


# One summary row per dataset: split sizes (training split capped by
# maybe_cap_split) and the mean of the test labels.
benchmark_rows = []
for dataset_name in DATASETS:
    bundle = prepare_sequence_dataset(dataset_name, seq_len=SEQ_LEN, horizon=HORIZON)
    Xtr, ytr, ltr = maybe_cap_split(*bundle['train'])
    Xv, yv, lv = bundle['val']
    Xte, yte, lte = bundle['test']
    benchmark_rows.append(
        dict(
            dataset=bundle['display_name'],
            application=bundle['application'],
            train_windows=len(Xtr),
            validation_windows=len(Xv),
            test_windows=len(Xte),
            incident_fraction_test=float(lte.mean()),
        )
    )

benchmark_overview = pd.DataFrame(benchmark_rows)
benchmark_overview

{'device': 'cpu', 'torch': '2.10.0'}

# One stacked panel per dataset showing the first (up to) 1500 rows of the raw
# series, with labelled rows shaded in red.
fig, axes = plt.subplots(nrows=len(DATASETS), ncols=1, figsize=(14, 9), sharex=False)
# np.atleast_1d keeps the zip working when there is a single dataset and
# plt.subplots hands back a bare Axes instead of an array.
for ax, dataset_name in zip(np.atleast_1d(axes), DATASETS):
    frame = prepare_sequence_dataset(dataset_name, seq_len=SEQ_LEN, horizon=HORIZON)['frame']
    sample = frame.iloc[:1500]
    ax.plot(sample['timestamp'], sample['value'], linewidth=0.9, color='#1f77b4')
    # Shade the full value range wherever the label column is truthy.
    ax.fill_between(
        sample['timestamp'],
        sample['value'].min(),
        sample['value'].max(),
        where=sample['label'].astype(bool),
        color='#d62728',
        alpha=0.18,
    )
    ax.set_title(DATASET_SPECS[dataset_name]['display_name'])
    ax.set_ylabel('Value')
plt.tight_layout()
plt.show()

def train_model(model, train_loader, val_loader, labels_train, epochs=EPOCHS, alpha=ALPHA, lr=1e-3):
    """Train ``model`` on a weighted sum of forecast MSE and incident BCE.

    The model is expected to return ``(forecast, drift_logit)`` for a batch of
    inputs. The loss is ``alpha * MSE(forecast, target) + (1 - alpha) *
    BCEWithLogits(drift_logit, label)``, where the positive class is
    re-weighted by the negative/positive count ratio of ``labels_train``.

    Args:
        model: Network to train; moved to the module-level ``device``.
        train_loader: Iterable of ``(inputs, targets, labels)`` batches.
        val_loader: Accepted for interface symmetry; not used by this function.
        labels_train: Training labels used only to derive ``pos_weight``.
        epochs: Number of full passes over ``train_loader``.
        alpha: Weight of the forecast loss in the combined objective.
        lr: Learning rate for AdamW.

    Returns:
        The trained model (on ``device``).
    """
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    forecast_criterion = nn.MSELoss()

    # Ratio of negatives to positives; the denominator is floored at 1.0 so a
    # label set without positives does not divide by zero.
    n_positive = labels_train.sum()
    n_negative = len(labels_train) - n_positive
    pos_weight = torch.tensor([n_negative / max(n_positive, 1.0)], device=device)
    incident_criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    for _epoch in range(epochs):
        model.train()
        for inputs, targets, incident_labels in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            incident_labels = incident_labels.to(device)

            optimizer.zero_grad()
            forecast, drift_logit = model(inputs)
            combined = (
                alpha * forecast_criterion(forecast, targets)
                + (1 - alpha) * incident_criterion(drift_logit.squeeze(-1), incident_labels)
            )
            combined.backward()
            optimizer.step()

    return model



@torch.no_grad()
def evaluate_model(model, loader, y_true, inverse_target):
    """Score ``model`` on ``loader`` for both forecasting and classification.

    Args:
        model: Network returning ``(forecast, drift_logit)`` per batch.
        loader: Iterable of ``(inputs, targets, labels)`` batches; the targets
            in the batches are ignored.
        y_true: Reference forecast targets compared against the
            inverse-transformed predictions.
        inverse_target: Callable applied to the stacked forecasts before they
            are passed to ``forecast_metrics``.

    Returns:
        The dict from ``forecast_metrics`` updated with the entries from
        ``classification_metrics``.
    """
    model.eval()

    forecast_batches = []
    logit_batches = []
    label_batches = []
    for inputs, _targets, batch_labels in loader:
        forecast, drift_logit = model(inputs.to(device))
        forecast_batches.append(forecast.cpu().numpy())
        logit_batches.append(drift_logit.cpu().numpy().reshape(-1))
        label_batches.append(batch_labels.numpy())

    all_forecasts = np.vstack(forecast_batches)
    all_logits = np.concatenate(logit_batches)
    all_labels = np.concatenate(label_batches)

    results = forecast_metrics(y_true, inverse_target(all_forecasts))
    # NOTE(review): raw logits (no sigmoid) are handed over here — confirm
    # that classification_metrics expects logits rather than probabilities.
    results.update(classification_metrics(all_labels, all_logits))
    return results



# Train every candidate architecture on every dataset and collect one row of
# test-split metrics per (dataset, model) pair in `benchmark_results`.
benchmark_results = []

for dataset_name in DATASETS:

    bundle = prepare_sequence_dataset(dataset_name, seq_len=SEQ_LEN, horizon=HORIZON)

    # Only the training split is capped; validation and test are used in full.
    Xtr, ytr, ltr = maybe_cap_split(*bundle['train'])

    Xv, yv, lv = bundle['val']

    Xte, yte, lte = bundle['test']

    # Min/range of entry 0 of the bundle's x_min / x_max, used to min-max scale
    # the forecast targets.
    # NOTE(review): assumes entry 0 corresponds to the target series — confirm.
    target_min = float(bundle['x_min'][0])

    # Floor the range at 1e-6 so a constant series does not divide by zero.
    target_range = max(float(bundle['x_max'][0] - bundle['x_min'][0]), 1e-6)



    # The scaling constants are bound as default arguments so each helper keeps
    # the values of the dataset it was created for, even after the loop moves on.
    def scale_target(values, target_min=target_min, target_range=target_range):

        return (values - target_min) / target_range



    # Inverse of scale_target: maps scaled values back to original units.
    def inverse_target(values, target_min=target_min, target_range=target_range):

        return values * target_range + target_min



    ytr_scaled = scale_target(ytr)

    yv_scaled = scale_target(yv)

    yte_scaled = scale_target(yte)



    # Only the training loader shuffles; val/test keep their original order.
    train_loader = make_loader(Xtr, ytr_scaled, ltr, shuffle=True)

    val_loader = make_loader(Xv, yv_scaled, lv)

    test_loader = make_loader(Xte, yte_scaled, lte)

    input_dim = Xtr.shape[-1]



    # All three models are instantiated here, before any of them is trained.
    candidates = {

        'LSTM': LSTMForecaster(input_dim=input_dim, hidden_dim=96, num_layers=2, horizon=HORIZON, dropout=0.2),

        'GRU': GRUForecaster(input_dim=input_dim, hidden_dim=96, num_layers=2, horizon=HORIZON, dropout=0.2),

        'Transformer': TransformerForecaster(input_dim=input_dim, d_model=96, nhead=4, num_layers=2, dim_ff=192, horizon=HORIZON, dropout=0.1),

    }



    for model_name, model in candidates.items():

        # val_loader is passed through, but train_model does not use it.
        trained = train_model(model, train_loader, val_loader, ltr)

        # Forecast metrics compare the unscaled test targets (yte) with the
        # inverse-transformed predictions.
        metrics = evaluate_model(trained, test_loader, yte, inverse_target)

        metrics['dataset'] = bundle['display_name']

        metrics['application'] = bundle['application']

        metrics['model'] = model_name

        benchmark_results.append(metrics)



benchmark_results_df = pd.DataFrame(benchmark_results)

# Displayed sorted by dataset, then by ascending MAE; the frame itself is not
# reordered.
benchmark_results_df.sort_values(['dataset', 'MAE'])

/Users/mohuyn/Library/CloudStorage/OneDrive-SAS/Documents/GitHub/Quantum-Drift-Forecasting/src/models.py:204: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
  self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)

/Users/mohuyn/Library/CloudStorage/OneDrive-SAS/Documents/GitHub/Quantum-Drift-Forecasting/src/models.py:204: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
  self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)

/Users/mohuyn/Library/CloudStorage/OneDrive-SAS/Documents/GitHub/Quantum-Drift-Forecasting/src/models.py:204: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.norm_first was True
  self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)

# Dataset x model grids of MAE and F1, drawn side by side as annotated heatmaps.
mae_table = benchmark_results_df.pivot(index='dataset', columns='model', values='MAE')
f1_table = benchmark_results_df.pivot(index='dataset', columns='model', values='F1')

fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Left panel: MAE, with a reversed colormap.
mae_im = axes[0].imshow(mae_table.values, cmap='viridis_r', aspect='auto')
axes[0].set_title('MAE across datasets and models')
axes[0].set_xticks(range(len(mae_table.columns)), mae_table.columns, rotation=25)
axes[0].set_yticks(range(len(mae_table.index)), mae_table.index)
# Write each cell's value at its centre (x = column index, y = row index).
for i, mae_row in enumerate(mae_table.values):
    for j, mae_value in enumerate(mae_row):
        axes[0].text(j, i, f'{mae_value:.2f}', ha='center', va='center', color='white')
fig.colorbar(mae_im, ax=axes[0], fraction=0.046, pad=0.04)

# Right panel: incident F1.
f1_im = axes[1].imshow(f1_table.values, cmap='magma', aspect='auto')
axes[1].set_title('Incident F1 across datasets and models')
axes[1].set_xticks(range(len(f1_table.columns)), f1_table.columns, rotation=25)
axes[1].set_yticks(range(len(f1_table.index)), f1_table.index)
for i, f1_row in enumerate(f1_table.values):
    for j, f1_value in enumerate(f1_row):
        axes[1].text(j, i, f'{f1_value:.2f}', ha='center', va='center', color='white')
fig.colorbar(f1_im, ax=axes[1], fraction=0.046, pad=0.04)

plt.tight_layout()
plt.show()

# Per-model mean of MAE, F1 and ROC-AUC over all datasets, best mean MAE first.
# The mean MAE is an unweighted average of raw per-dataset errors, so datasets
# with larger value scales contribute more to it.
aggregate_rank = (
    benchmark_results_df
    .groupby('model')[['MAE', 'F1', 'ROC-AUC']]
    .mean()
    .sort_values(by='MAE')
)
aggregate_rank

# Per-model means of the four headline metrics, ordered by mean MAE, shown as
# a 2x2 grid of bar charts and then as a table.
summary_spec = {
    'mean_mae': ('MAE', 'mean'),
    'mean_rmse': ('RMSE', 'mean'),
    'mean_f1': ('F1', 'mean'),
    'mean_auc': ('ROC-AUC', 'mean'),
}
summary = (
    benchmark_results_df
    .groupby('model')
    .agg(**summary_spec)
    .sort_values(by='mean_mae')
)

summary.plot(kind='bar', subplots=True, layout=(2, 2), figsize=(13, 7), legend=False, sharex=True)
plt.tight_layout()
plt.show()

summary

# Per-dataset dense ranks (1 = best): MAE ranks ascending, F1 and ROC-AUC
# descending. Models with equal scores share a rank.
ranked_results = benchmark_results_df.copy()
ranked_results['mae_rank'] = ranked_results.groupby('dataset')['MAE'].rank(method='dense', ascending=True)
ranked_results['f1_rank'] = ranked_results.groupby('dataset')['F1'].rank(method='dense', ascending=False)
ranked_results['auc_rank'] = ranked_results.groupby('dataset')['ROC-AUC'].rank(method='dense', ascending=False)

# Mean rank of each model across datasets, one column per metric.
rank_summary = ranked_results.groupby('model')[['mae_rank', 'f1_rank', 'auc_rank']].mean().reset_index()

# Label used instead of a model name when the two best scores are equal.
TIE_LABEL = 'tie'


def _leader_and_margin(ordered, metric):
    """Return ``(leader, margin)`` for rows already sorted best-first on ``metric``.

    A zero gap between the two best rows is reported as ``TIE_LABEL`` rather
    than crediting whichever row the sort happened to place first. With a
    single row there is no runner-up, so the margin is NaN.
    """
    if len(ordered) < 2:
        return ordered.loc[0, 'model'], float('nan')
    margin = abs(ordered.loc[0, metric] - ordered.loc[1, metric])
    if margin == 0:
        return TIE_LABEL, margin
    return ordered.loc[0, 'model'], margin


winner_rows = []
for dataset_name, dataset_frame in benchmark_results_df.groupby('dataset'):
    # A stable sort keeps equal scores in their original row order, so the
    # ordering is reproducible across runs and pandas versions.
    mae_order = dataset_frame.sort_values('MAE', kind='mergesort').reset_index(drop=True)
    f1_order = dataset_frame.sort_values('F1', ascending=False, kind='mergesort').reset_index(drop=True)
    mae_winner, mae_margin = _leader_and_margin(mae_order, 'MAE')
    f1_winner, f1_margin = _leader_and_margin(f1_order, 'F1')
    winner_rows.append(
        {
            'dataset': dataset_name,
            'mae_winner': mae_winner,
            'mae_margin_to_runner_up': mae_margin,
            'f1_winner': f1_winner,
            'f1_margin_to_runner_up': f1_margin,
        }
    )
winner_table = pd.DataFrame(winner_rows)

# Count MAE and F1 wins per model. Reindexing on the model names drops the
# TIE_LABEL entries, so a tie is credited to nobody, and fills models that
# never lead with 0.
winner_counts = (
    pd.concat([winner_table['mae_winner'], winner_table['f1_winner']])
    .value_counts()
    .reindex(sorted(benchmark_results_df['model'].unique()), fill_value=0)
    .rename('cross_dataset_wins')
)

# Four-panel dashboard built from rank_summary, benchmark_results_df,
# winner_counts and winner_table:
#   [0, 0] mean rank per model   [0, 1] MAE vs F1 scatter
#   [1, 0] win counts            [1, 1] leader margins per dataset
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Panel [0, 0]: grouped bars, three per model, offset by one bar width around
# each model's tick position.
rank_positions = np.arange(len(rank_summary))
width = 0.24
for offset, column, color, label in [
    (-width, 'mae_rank', '#0984e3', 'MAE rank'),
    (0.0, 'f1_rank', '#00b894', 'F1 rank'),
    (width, 'auc_rank', '#6c5ce7', 'ROC-AUC rank'),
]:
    axes[0, 0].bar(rank_positions + offset, rank_summary[column], width=width, color=color, label=label)
axes[0, 0].set_xticks(rank_positions, rank_summary['model'])
axes[0, 0].set_title('Mean rank across datasets and metrics')
axes[0, 0].set_ylabel('Average rank (lower is better)')
axes[0, 0].legend()

# Panel [0, 1]: one point per (dataset, model) run, positioned by MAE and F1
# and coloured by ROC-AUC.
scatter = axes[0, 1].scatter(
    benchmark_results_df['MAE'],
    benchmark_results_df['F1'],
    c=benchmark_results_df['ROC-AUC'],
    cmap='viridis',
    s=180,
    alpha=0.9,
    edgecolor='black',
)
# Label each point with the first word of the dataset name plus the model name.
for _, row in benchmark_results_df.iterrows():
    axes[0, 1].annotate(
        f"{row['dataset'].split()[0]}-{row['model']}",
        (row['MAE'], row['F1']),
        textcoords='offset points',
        xytext=(6, 4),
        fontsize=8,
    )
axes[0, 1].set_title('Joint forecasting and incident-detection frontier')
axes[0, 1].set_xlabel('MAE')
axes[0, 1].set_ylabel('F1')
fig.colorbar(scatter, ax=axes[0, 1], fraction=0.046, pad=0.04, label='ROC-AUC')

# Panel [1, 0]: number of MAE/F1 wins per model.
# NOTE(review): the colour list is hard-coded to three entries — confirm it
# still matches if the set of models changes.
axes[1, 0].bar(winner_counts.index, winner_counts.values, color=['#00b894', '#0984e3', '#6c5ce7'])
axes[1, 0].set_title('Cross-dataset win count (MAE and F1 leaders)')
axes[1, 0].set_ylabel('Number of wins')

# Panel [1, 1]: paired bars per dataset showing the leader's margin over the
# runner-up. The MAE and F1 margins share one y-axis in their raw units.
margin_positions = np.arange(len(winner_table))
axes[1, 1].bar(
    margin_positions - 0.18,
    winner_table['mae_margin_to_runner_up'],
    width=0.36,
    color='#74b9ff',
    label='MAE margin',
)
axes[1, 1].bar(
    margin_positions + 0.18,
    winner_table['f1_margin_to_runner_up'],
    width=0.36,
    color='#ff7675',
    label='F1 margin',
)
axes[1, 1].set_xticks(margin_positions, winner_table['dataset'], rotation=15)
axes[1, 1].set_title('Leader margin over runner-up by dataset')
axes[1, 1].set_ylabel('Metric margin')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

winner_table

# ── Additional Figure: Per-Dataset, Per-Metric Model Comparison ─────────────
# Shows exactly which model wins on which metric on which dataset,
# and by how much. This is the evidence behind the "no single winner" claim.
metrics_to_show = ["MAE", "RMSE", "F1", "ROC-AUC"]
directions       = ["↓ lower", "↓ lower", "↑ higher", "↑ higher"]
datasets  = benchmark_results_df["dataset"].unique()
models_list = benchmark_results_df["model"].unique()
# Bar colour per model name; names missing from the map fall back to grey.
colors_map = {"GRU": "#10b981", "LSTM": "#f59e0b", "VanillaRNN": "#94a3b8", "Transformer": "#6366f1"}

# One row of panels per metric, one column per dataset. squeeze=False keeps
# `axes` two-dimensional even when only a single dataset is present, so the
# axes[row_idx, col_idx] lookup below always works.
fig, axes = plt.subplots(len(metrics_to_show), len(datasets),
                         figsize=(14, 4 * len(metrics_to_show)), sharey="row",
                         squeeze=False)

for row_idx, (metric, direction) in enumerate(zip(metrics_to_show, directions)):
    for col_idx, ds in enumerate(datasets):
        ax = axes[row_idx, col_idx]
        subset = benchmark_results_df[benchmark_results_df["dataset"] == ds].copy()
        # Collect value, label and colour for every model that has a row for
        # this dataset; models without one are skipped.
        vals = []
        labs = []
        cols = []
        for m in models_list:
            row = subset[subset["model"] == m]
            if not row.empty:
                vals.append(float(row[metric].iloc[0]))
                labs.append(m)
                cols.append(colors_map.get(m, "#666"))
        bars = ax.bar(labs, vals, color=cols, width=0.65)
        # Print each value just above the top of its bar.
        for bar, val in zip(bars, vals):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() * 1.01,
                    f"{val:.3f}", ha="center", va="bottom", fontsize=8)
        # Dataset names are truncated to 20 characters in the panel title.
        ax.set_title(f"{ds[:20]}\n{metric} ({direction} = better)", fontsize=9)
        ax.tick_params(axis="x", rotation=30, labelsize=8)
        if col_idx == 0:
            ax.set_ylabel(metric)

plt.suptitle(
    "Figure 5. Per-Dataset, Per-Metric Model Comparison\n"
    "Shows exactly which model leads on which metric for each of the three temporal regimes — "
    "the visual evidence that model preference depends on both dataset and objective",
    fontweight="bold", fontsize=11
)
plt.tight_layout()
plt.show()

Model	Mean MAE ↓	Mean RMSE ↓	Mean F1 ↑	Mean ROC-AUC ↑	Leads on
GRU	1337.33	1628.83	0.0942	0.6603	Forecast accuracy + anomaly ranking
Transformer	1436.25	1791.61	0.0530	0.1955	Periodic calibration signals only
LSTM	1528.83	1895.01	0.1057	0.6278	Incident detection frequency

Dataset	Technical Setting	Relevance to Quantum Hardware
Machine Temperature	Equipment thermal health monitoring	Thermal drift directly drives decoherence and gate error escalation in quantum systems.
EC2 CPU Utilization	Cloud compute utilization monitoring	Periodic calibration signal variability and anomaly detection against structured operational backgrounds.
NYC Taxi	High-volume demand forecasting	Uncertainty quantification and anomaly localization in time-dependent high-dimensional operational data.

Model	Mean MAE ↓	Mean RMSE ↓	Mean F1 ↑	Mean ROC-AUC ↑
GRU	1337.33	1628.83	0.0942	0.6603
Transformer	1436.25	1791.61	0.0530	0.1955
LSTM	1528.83	1895.01	0.1057	0.6278

Dataset	Model	MAE	RMSE	MAPE%	Precision	Recall	F1	ROC-AUC
Machine Temp.	GRU	9.459	13.491	17.97	0.000	0.000	0.000	1.000
Machine Temp.	Transformer	12.294	19.743	26.03	0.000	0.000	0.000	0.000
Machine Temp.	LSTM	12.965	19.809	26.57	0.000	0.000	0.000	1.000
EC2 CPU	LSTM	0.04655	0.13425	35.31	0.000	0.000	0.000	0.435
EC2 CPU	Transformer	0.04924	0.13531	39.10	0.000	0.000	0.000	0.183
EC2 CPU	GRU	0.05152	0.13520	40.87	0.000	0.000	0.000	0.551
NYC Taxi	GRU	4002.49	4872.85	347.83	0.200	0.482	0.283	0.430
NYC Taxi	Transformer	4296.40	5354.94	355.12	0.165	0.153	0.159	0.403
NYC Taxi	LSTM	4573.47	5665.10	466.77	0.213	0.619	0.317	0.448

Dataset	MAE Winner	MAE Margin	F1 Winner	F1 Margin
EC2 CPU Utilization	LSTM	0.002698 over Transformer	LSTM	tied at 0.000
Machine Temp. Failure	GRU	2.835 over Transformer	LSTM	tied at 0.000
NYC Taxi Demand	GRU	293.907 over Transformer	LSTM	0.035 over GRU

Cross-Domain Benchmarking for Objective-Aware Architecture Selection in Quantum Hardware Monitoring

1. Research Objective and Technical Contribution

Quantum Computing Contribution

Primary Result

Why No Architecture Dominates

Quantum Hardware Connection

2. Dataset and Technical Context

Statistical Comparison — Cross-Domain Benchmark Results

Mean Performance Across All Three Datasets

GRU vs LSTM (cross-domain)

GRU vs Transformer (cross-domain)

Per-Dataset Detailed Breakdown

Dataset-Level Winner Analysis

Metric Guide — How to Read Every Comparison Number

How improvements are computed in this notebook

3. Experimental Protocol

4. Model Construction

5. Training Procedure

6. Results and Visual Evidence

7. Technical Interpretation

8. Limitations and Scope

9. Key Takeaways

Metric	↓ or ↑	What It Measures	What a Cross-Domain Lead Means
Mean MAE	↓ lower = better	Average MAE across all three datasets — aggregate forecast accuracy	A model with the lowest Mean MAE is the most accurate forecaster on average across the heterogeneous signal regimes encountered in quantum hardware monitoring
Mean RMSE	↓ lower = better	Average RMSE across all three datasets	Lower mean RMSE = fewer large prediction misses on average. A model that leads on both Mean MAE and Mean RMSE is dominant on the forecasting objective
Mean F1	↑ higher = better	Average incident-detection F1 across all three datasets	The model with the highest Mean F1 is the most effective at detecting incidents in aggregate — even if it does not lead on every individual dataset
Mean ROC-AUC	↑ higher = better	Average anomaly-ranking quality across all three datasets	The model with the highest Mean ROC-AUC provides the most reliable anomaly priority queue on average across all deployment regimes

	dataset	application	train_windows	validation_windows	test_windows	incident_fraction_test
0	Machine Temperature System Failure	data-center thermal monitoring	6000	3395	3396	0.147232
1	EC2 CPU Utilization	cloud capacity monitoring	2781	595	597	0.673367
2	NYC Taxi Demand	urban mobility demand planning	6000	1539	1540	0.237013

	MAE	RMSE	MAPE_%	Precision	Recall	F1	ROC-AUC	dataset	application	model
3	0.046546	0.134253	35.306650	0.000000	0.000000	0.000000	0.435234	EC2 CPU Utilization	cloud capacity monitoring	LSTM
5	0.049243	0.135309	39.102721	0.000000	0.000000	0.000000	0.183448	EC2 CPU Utilization	cloud capacity monitoring	Transformer
4	0.051522	0.135197	40.866959	0.000000	0.000000	0.000000	0.551244	EC2 CPU Utilization	cloud capacity monitoring	GRU
1	9.459097	13.490814	17.971821	0.000000	0.000000	0.000000	1.000000	Machine Temperature System Failure	data-center thermal monitoring	GRU
2	12.293981	19.742779	26.034215	0.000000	0.000000	0.000000	0.000000	Machine Temperature System Failure	data-center thermal monitoring	Transformer
0	12.964861	19.808985	26.571679	0.000000	0.000000	0.000000	1.000000	Machine Temperature System Failure	data-center thermal monitoring	LSTM
7	4002.487549	4872.851562	347.828960	0.199773	0.482192	0.282504	0.429521	NYC Taxi Demand	urban mobility demand planning	GRU
8	4296.395020	5354.944824	355.122519	0.165192	0.153425	0.159091	0.403020	NYC Taxi Demand	urban mobility demand planning	Transformer
6	4573.467773	5665.095215	466.774702	0.213208	0.619178	0.317193	0.448192	NYC Taxi Demand	urban mobility demand planning	LSTM

	MAE	F1	ROC-AUC
model
GRU	1337.332723	0.094168	0.660255
Transformer	1436.246081	0.053030	0.195489
LSTM	1528.826393	0.105731	0.627809

	mean_mae	mean_rmse	mean_f1	mean_auc
model
GRU	1337.332723	1628.825858	0.094168	0.660255
Transformer	1436.246081	1791.607637	0.053030	0.195489
LSTM	1528.826393	1895.012817	0.105731	0.627809

	dataset	mae_winner	mae_margin_to_runner_up	f1_winner	f1_margin_to_runner_up
0	EC2 CPU Utilization	LSTM	0.002698	LSTM	0.000000
1	Machine Temperature System Failure	GRU	2.834884	LSTM	0.000000
2	NYC Taxi Demand	GRU	293.907471	LSTM	0.034689