wip: migrate training entry points to `sample.context` and `generate_anticipative_solver`

BatyLeo · BatyLeo · commit 37b816aba7ae · 2026-03-27T12:24:31.000+01:00
diff --git a/src/DecisionFocusedLearningAlgorithms.jl b/src/DecisionFocusedLearningAlgorithms.jl
@@ -10,7 +10,6 @@ using Statistics: mean
 using UnicodePlots: lineplot
 using ValueHistories: MVHistory
 
-include("utils.jl")
 include("training_context.jl")
 
 include("metrics/interface.jl")
diff --git a/src/algorithms/supervised/anticipative_imitation.jl b/src/algorithms/supervised/anticipative_imitation.jl
@@ -31,12 +31,11 @@ function train_policy!(
     anticipative_policy,
     epochs=10,
     metrics::Tuple=(),
-    maximizer_kwargs=get_state,
+    maximizer_kwargs=sample -> sample.context,
 )
     # Generate anticipative solutions as training data
     train_dataset = vcat(map(train_environments) do env
-        v, y = anticipative_policy(env; reset_env=true)
-        return y
+        return anticipative_policy(env; reset_env=true)
     end...)
 
     # Delegate to inner algorithm
@@ -62,26 +61,22 @@ Uses anticipative solutions as expert demonstrations.
 """
 function train_policy(
     algorithm::AnticipativeImitation,
-    benchmark::AbstractStochasticBenchmark{true};
+    benchmark::ExogenousDynamicBenchmark;
     dataset_size=30,
-    split_ratio=(0.3, 0.3),
     epochs=10,
     metrics::Tuple=(),
     seed=nothing,
 )
-    # Generate instances and environments
-    dataset = generate_dataset(benchmark, dataset_size)
-    train_instances, validation_instances, _ = splitobs(dataset; at=split_ratio)
-    train_environments = generate_environments(benchmark, train_instances)
+    # Generate environments
+    train_environments = generate_environments(benchmark, dataset_size; seed)
 
     # Initialize model and create policy
     model = generate_statistical_model(benchmark; seed)
     maximizer = generate_maximizer(benchmark)
     policy = DFLPolicy(model, maximizer)
 
     # Define anticipative policy from benchmark
-    anticipative_policy =
-        (env; reset_env) -> generate_anticipative_solution(benchmark, env; reset_env)
+    anticipative_policy = generate_anticipative_solver(benchmark)
 
     # Train policy
     history = train_policy!(
diff --git a/src/algorithms/supervised/dagger.jl b/src/algorithms/supervised/dagger.jl
@@ -34,7 +34,7 @@ function train_policy!(
     train_environments;
     anticipative_policy,
     metrics::Tuple=(),
-    maximizer_kwargs=get_state,
+    maximizer_kwargs=sample -> sample.context,
 )
     (; inner_algorithm, iterations, epochs_per_iteration, α_decay) = algorithm
     (; statistical_model, maximizer) = policy
@@ -43,15 +43,14 @@ function train_policy!(
 
     # Initial dataset from expert demonstrations
     train_dataset = vcat(map(train_environments) do env
-        v, y = anticipative_policy(env; reset_env=true)
-        return y
+        return anticipative_policy(env; reset_env=true)
     end...)
 
     dataset = deepcopy(train_dataset)
 
     # Initialize combined history for all DAgger iterations
     combined_history = MVHistory()
-    global_epoch = 0
+    epoch_offset = 0
 
     for iter in 1:iterations
         println("DAgger iteration $iter/$iterations (α=$(round(α, digits=3)))")
@@ -68,50 +67,24 @@ function train_policy!(
 
         # Merge iteration history into combined history
         for key in keys(iter_history)
-            epochs, values = get(iter_history, key)
-            for i in eachindex(epochs)
-                # Calculate global epoch number
-                if iter == 1
-                    # First iteration: use epochs as-is [0, 1, 2, ...]
-                    global_epoch_value = epochs[i]
-                else
-                    # Later iterations: skip epoch 0 and renumber starting from global_epoch
-                    if epochs[i] == 0
-                        continue  # Skip epoch 0 for iterations > 1
-                    end
-                    # Map epoch 1 → global_epoch, epoch 2 → global_epoch+1, etc.
-                    global_epoch_value = global_epoch + epochs[i] - 1
-                end
-
-                # For the epoch key, use global_epoch_value as both time and value
-                # For other keys, use global_epoch_value as time and original value
-                if key == :epoch
-                    push!(combined_history, key, global_epoch_value, global_epoch_value)
-                else
-                    push!(combined_history, key, global_epoch_value, values[i])
-                end
+            local_epochs, values = get(iter_history, key)
+            for i in eachindex(local_epochs)
+                # Skip epoch 0 for all iterations after the first
+                local_epochs[i] == 0 && epoch_offset > 0 && continue
+                global_e = epoch_offset + local_epochs[i]
+                push!(combined_history, key, global_e, key == :epoch ? global_e : values[i])
             end
         end
 
-        # Update global_epoch for next iteration
-        # After each iteration, advance by the number of non-zero epochs processed
-        if iter == 1
-            # First iteration processes all epochs [0, 1, ..., epochs_per_iteration]
-            # Next iteration should start at epochs_per_iteration + 1
-            global_epoch = epochs_per_iteration + 1
-        else
-            # Subsequent iterations skip epoch 0, so they process epochs_per_iteration epochs
-            # Next iteration should start epochs_per_iteration later
-            global_epoch += epochs_per_iteration
-        end
+        epoch_offset += epochs_per_iteration
 
         # Dataset update - collect new samples using mixed policy
         new_samples = eltype(dataset)[]
         for env in train_environments
             DecisionFocusedLearningBenchmarks.reset!(env; reset_rng=false)
             while !is_terminated(env)
                 x_before = copy(observe(env)[1])
-                _, anticipative_solution = anticipative_policy(env; reset_env=false)
+                anticipative_solution = anticipative_policy(env; reset_env=false)
                 p = rand()
                 target = anticipative_solution[1]
                 x, state = observe(env)
@@ -149,25 +122,21 @@ This high-level function handles all setup from the benchmark and returns a trai
 """
 function train_policy(
     algorithm::DAgger,
-    benchmark::AbstractStochasticBenchmark{true};
+    benchmark::ExogenousDynamicBenchmark;
     dataset_size=30,
-    split_ratio=(0.3, 0.3, 0.4),
     metrics::Tuple=(),
     seed=0,
 )
-    # Generate dataset and environments
-    dataset = generate_dataset(benchmark, dataset_size)
-    train_instances, validation_instances, _ = splitobs(dataset; at=split_ratio)
-    train_environments = generate_environments(benchmark, train_instances; seed)
+    # Generate environments
+    train_environments = generate_environments(benchmark, dataset_size; seed)
 
     # Initialize model and create policy
     model = generate_statistical_model(benchmark)
     maximizer = generate_maximizer(benchmark)
     policy = DFLPolicy(model, maximizer)
 
     # Define anticipative policy from benchmark
-    anticipative_policy =
-        (env; reset_env) -> generate_anticipative_solution(benchmark, env; reset_env)
+    anticipative_policy = generate_anticipative_solver(benchmark)
 
     # Train policy
     history = train_policy!(
@@ -176,7 +145,7 @@ function train_policy(
         train_environments;
         anticipative_policy=anticipative_policy,
         metrics=metrics,
-        maximizer_kwargs=get_state,
+        maximizer_kwargs=sample -> sample.context,
     )
 
     return history, policy
diff --git a/src/algorithms/supervised/fyl.jl b/src/algorithms/supervised/fyl.jl
@@ -45,7 +45,7 @@ function train_policy!(
     train_dataset::DataLoader;
     epochs=100,
     metrics::Tuple=(),
-    maximizer_kwargs=get_info,
+    maximizer_kwargs=sample -> sample.context,
 )
     (; nb_samples, ε, threaded, training_optimizer, seed) = algorithm
     (; statistical_model, maximizer) = policy
@@ -106,7 +106,7 @@ function train_policy!(
     train_dataset::AbstractArray{<:DataSample};
     epochs=100,
     metrics::Tuple=(),
-    maximizer_kwargs=get_info,
+    maximizer_kwargs=sample -> sample.context,
 )
     data_loader = DataLoader(train_dataset; batchsize=1, shuffle=false)
     return train_policy!(
@@ -131,24 +131,32 @@ This high-level function handles all setup from the benchmark and returns a trai
 function train_policy(
     algorithm::PerturbedFenchelYoungLossImitation,
     benchmark::AbstractBenchmark;
+    target_policy=nothing,
     dataset_size=30,
     split_ratio=(0.3, 0.3),
     epochs=100,
     metrics::Tuple=(),
     seed=nothing,
 )
     # Generate dataset and split
-    dataset = generate_dataset(benchmark, dataset_size)
+    dataset = generate_dataset(benchmark, dataset_size; target_policy)
     train_instances, _, _ = splitobs(dataset; at=split_ratio)
 
+    if any(s -> isnothing(s.y), train_instances)
+        error(
+            "Training dataset contains unlabeled samples (y=nothing). " *
+            "Provide a `target_policy` kwarg to label samples during dataset generation.",
+        )
+    end
+
     # Initialize model and create policy
     model = generate_statistical_model(benchmark; seed)
     maximizer = generate_maximizer(benchmark)
     policy = DFLPolicy(model, maximizer)
 
     # Train policy
     history = train_policy!(
-        algorithm, policy, train_instances; epochs, metrics, maximizer_kwargs=get_info
+        algorithm, policy, train_instances; epochs, metrics, maximizer_kwargs=s -> s.context
     )
 
     return history, policy
diff --git a/src/metrics/accumulators.jl b/src/metrics/accumulators.jl
@@ -151,8 +151,8 @@ end
 Construct a FYLLossMetric for a given dataset.
 
 # Arguments
-- `dataset` - Dataset to evaluate on (should have samples with `.x`, `.y`, and `.info` fields)
-- `name::Symbol` - Identifier for the metric (default: `:fyl_loss`)
+- `dataset`: Dataset to evaluate on (should have samples with `.x`, `.y`, and `.context` fields)
+- `name::Symbol`: Identifier for the metric (default: `:fyl_loss`)
 """
 function FYLLossMetric(dataset, name::Symbol=:fyl_loss)
     return FYLLossMetric(dataset, LossAccumulator(name))
@@ -181,11 +181,11 @@ $TYPEDSIGNATURES
 Update the metric with a single loss computation.
 
 # Arguments
-- `metric::FYLLossMetric` - The metric to update
-- `loss::FenchelYoungLoss` - Loss function to use
-- `θ` - Model prediction
-- `y_target` - Target value
-- `kwargs...` - Additional arguments passed to loss function
+- `metric::FYLLossMetric`: The metric to update
+- `loss::FenchelYoungLoss`: Loss function to use
+- `θ`: Model prediction
+- `y_target`: Target value
+- `kwargs...`: Additional arguments passed to loss function
 """
 function update!(metric::FYLLossMetric, loss::FenchelYoungLoss, θ, y_target; kwargs...)
     l = loss(θ, y_target; kwargs...)
@@ -202,8 +202,8 @@ This method iterates through the dataset, computes predictions using `context.po
 and accumulates losses using `context.loss`. The dataset should be stored in the metric.
 
 # Arguments
-- `metric::FYLLossMetric` - The metric to evaluate
-- `context` - TrainingContext with `policy`, `loss`, and other fields
+- `metric::FYLLossMetric`: The metric to evaluate
+- `context::TrainingContext`: TrainingContext with `policy`, `loss`, and other fields
 """
 function evaluate!(metric::FYLLossMetric, context::TrainingContext)
     reset!(metric)
diff --git a/src/metrics/function_metric.jl b/src/metrics/function_metric.jl
@@ -18,21 +18,21 @@ epoch_metric = FunctionMetric(ctx -> ctx.epoch, :current_epoch)
 
 # Metric with stored data (dataset)
 gap_metric = FunctionMetric(:val_gap, val_data) do ctx, data
-    compute_gap(benchmark, data, ctx.model, ctx.maximizer)
+    compute_gap(benchmark, data, ctx.policy.statistical_model, ctx.policy.maximizer)
 end
 
 # Metric returning multiple values
 dual_gap = FunctionMetric(:gaps, (train_data, val_data)) do ctx, datasets
     train_ds, val_ds = datasets
     return (
-        train_gap = compute_gap(benchmark, train_ds, ctx.model, ctx.maximizer),
-        val_gap = compute_gap(benchmark, val_ds, ctx.model, ctx.maximizer)
+        train_gap = compute_gap(benchmark, train_ds, ctx.policy.statistical_model, ctx.policy.maximizer),
+        val_gap = compute_gap(benchmark, val_ds, ctx.policy.statistical_model, ctx.policy.maximizer)
     )
 end
 ```
 
 # See also
-- [`PeriodicMetric`](@ref) - Wrap a metric to evaluate periodically
+- [`PeriodicMetric`](@ref): Wrap a metric to evaluate periodically
 - [`evaluate!`](@ref)
 """
 struct FunctionMetric{F,D} <: AbstractMetric
@@ -52,8 +52,8 @@ Construct a FunctionMetric without stored data.
 The function should have signature `(context) -> value`.
 
 # Arguments
-- `metric_fn::Function` - Function to compute the metric
-- `name::Symbol` - Identifier for the metric
+- `metric_fn::Function`: Function to compute the metric
+- `name::Symbol`: Identifier for the metric
 """
 function FunctionMetric(metric_fn::F, name::Symbol) where {F}
     return FunctionMetric{F,Nothing}(metric_fn, name, nothing)
@@ -65,8 +65,8 @@ $TYPEDSIGNATURES
 Evaluate the function metric by calling the stored function.
 
 # Arguments
-- `metric::FunctionMetric` - The metric to evaluate
-- `context` - TrainingContext with current training state
+- `metric::FunctionMetric`: The metric to evaluate
+- `context::TrainingContext`: TrainingContext with current training state
 
 # Returns
 - The value returned by `metric.metric_fn` (can be single value or NamedTuple)
diff --git a/src/metrics/interface.jl b/src/metrics/interface.jl
@@ -4,7 +4,7 @@ $TYPEDEF
 Abstract base type for all metrics used during training.
 
 All concrete metric types should implement:
-- `evaluate!(metric, context)` - Evaluate the metric given a training context
+- `evaluate!(metric, context)`: Evaluate the metric given a training context
 
 # See also
 - [`LossAccumulator`](@ref)
@@ -20,14 +20,14 @@ abstract type AbstractMetric end
 Evaluate the metric given the current training context.
 
 # Arguments
-- `metric::AbstractMetric` - The metric to evaluate
-- `context::TrainingContext` - Current training state (model, epoch, maximizer, etc.)
+- `metric::AbstractMetric`: The metric to evaluate
+- `context::TrainingContext`: Current training state (policy, epoch, loss, etc.)
 
 # Returns
 Can return:
-- A single value (Float64, Int, etc.) - stored with `metric.name`
-- A `NamedTuple` - each key-value pair stored separately
-- `nothing` - skipped (e.g., periodic metrics on off-epochs)
+- A single value (Float64, Int, etc.): stored with `metric.name`
+- A `NamedTuple`: each key-value pair stored separately
+- `nothing`: skipped (e.g., periodic metrics on off-epochs)
 """
 function evaluate! end
 
@@ -89,9 +89,9 @@ This function handles three types of metric returns through multiple dispatch:
 - **nothing**: Skipped (e.g., periodic metrics on epochs when not evaluated)
 
 # Arguments
-- `history::MVHistory` - MVHistory object to store metric values
-- `metrics::Tuple` - Tuple of AbstractMetric instances to evaluate
-- `context::TrainingContext` - TrainingContext with current training state (policy, epoch, etc.)
+- `history::MVHistory`: MVHistory object to store metric values
+- `metrics::Tuple`: Tuple of AbstractMetric instances to evaluate
+- `context::TrainingContext`: TrainingContext with current training state (policy, epoch, etc.)
 
 # Examples
 ```julia
diff --git a/src/metrics/periodic.jl b/src/metrics/periodic.jl
@@ -74,8 +74,8 @@ $TYPEDSIGNATURES
 Evaluate the wrapped metric only if the current epoch matches the frequency pattern.
 
 # Arguments
-- `pm::PeriodicMetric` - The periodic metric wrapper
-- `context` - TrainingContext with current epoch
+- `pm::PeriodicMetric`: The periodic metric wrapper
+- `context::TrainingContext`: TrainingContext with current epoch
 
 # Returns
 - The result of `evaluate!(pm.metric, context)` if epoch matches the pattern
diff --git a/src/policies/dfl_policy.jl b/src/policies/dfl_policy.jl
@@ -22,3 +22,17 @@ function (p::DFLPolicy)(features::AbstractArray; kwargs...)
     y = p.maximizer(θ; kwargs...)
     return y
 end
+
+"""
+$TYPEDSIGNATURES
+
+Convenience overload: evaluate the optimality gap using a [`DFLPolicy`](@ref) directly,
+instead of unpacking `policy.statistical_model` and `policy.maximizer`.
+"""
+function DecisionFocusedLearningBenchmarks.compute_gap(
+    bench, dataset, policy::DFLPolicy, op=mean
+)
+    return DecisionFocusedLearningBenchmarks.compute_gap(
+        bench, dataset, policy.statistical_model, policy.maximizer, op
+    )
+end
diff --git a/src/training_context.jl b/src/training_context.jl
diff --git a/src/utils.jl b/src/utils.jl
diff --git a/test/dagger.jl b/test/dagger.jl