Add replay buffer and seeded RNG options to DAgger + fix PeriodicMetric firing before its offset

BatyLeo · BatyLeo · commit 52e80c72fac5 · 2026-03-27T15:34:27.000+01:00
diff --git a/src/DecisionFocusedLearningAlgorithms.jl b/src/DecisionFocusedLearningAlgorithms.jl
@@ -6,6 +6,7 @@ using Flux: Flux, Adam
 using InferOpt: InferOpt, FenchelYoungLoss, PerturbedAdditive, PerturbedMultiplicative
 using MLUtils: splitobs, DataLoader
 using ProgressMeter: @showprogress
+using Random: Random, MersenneTwister
 using Statistics: mean
 using UnicodePlots: lineplot
 using ValueHistories: MVHistory
diff --git a/src/algorithms/supervised/dagger.jl b/src/algorithms/supervised/dagger.jl
@@ -8,7 +8,7 @@ Reference: <https://arxiv.org/abs/2402.04463>
 # Fields
 $TYPEDFIELDS
 """
-@kwdef struct DAgger{A} <: AbstractImitationAlgorithm
+@kwdef struct DAgger{A,S} <: AbstractImitationAlgorithm
     "inner imitation algorithm for supervised learning"
     inner_algorithm::A = PerturbedFenchelYoungLossImitation()
     "number of DAgger iterations"
@@ -17,6 +17,11 @@ $TYPEDFIELDS
     epochs_per_iteration::Int = 3
     "decay factor for mixing expert and learned policy"
     α_decay::Float64 = 0.9
+    "random seed for the expert/policy mixing coin-flip (nothing = non-reproducible)"
+    seed::S = nothing
+    "maximum dataset size across iterations (nothing keeps all samples,
+    an integer caps to the most recent N samples via FIFO)"
+    max_dataset_size::Union{Int,Nothing} = nothing
 end
 
 """
@@ -36,9 +41,10 @@ function train_policy!(
     metrics::Tuple=(),
     maximizer_kwargs=sample -> sample.context,
 )
-    (; inner_algorithm, iterations, epochs_per_iteration, α_decay) = algorithm
+    (; inner_algorithm, iterations, epochs_per_iteration, α_decay, seed) = algorithm
     (; statistical_model, maximizer) = policy
 
+    rng = isnothing(seed) ? MersenneTwister() : MersenneTwister(seed)
     α = 1.0
 
     # Initial dataset from expert demonstrations
@@ -85,7 +91,7 @@ function train_policy!(
             while !is_terminated(env)
                 x_before = copy(observe(env)[1])
                 anticipative_solution = anticipative_policy(env; reset_env=false)
-                p = rand()
+                p = rand(rng)
                 target = anticipative_solution[1]
                 x, state = observe(env)
                 if size(target.x) != size(x)
@@ -104,7 +110,10 @@ function train_policy!(
                 step!(env, action)
             end
         end
-        dataset = new_samples  # TODO: replay buffer
+        dataset = vcat(dataset, new_samples)
+        if !isnothing(algorithm.max_dataset_size)
+            dataset = last(dataset, algorithm.max_dataset_size)
+        end
         α *= α_decay  # Decay factor for mixing expert and learned policy
     end
 
diff --git a/src/metrics/interface.jl b/src/metrics/interface.jl
@@ -43,15 +43,7 @@ Internal helper to store a single metric value in the history.
 function _store_metric_value!(
     history::MVHistory, metric_name::Symbol, epoch::Int, value::Number
 )
-    try
-        push!(history, metric_name, epoch, value)
-    catch e
-        throw(
-            ErrorException(
-                "Failed to store metric '$metric_name' at epoch $epoch: $(e.msg)"
-            ),
-        )
-    end
+    push!(history, metric_name, epoch, value)  # errors from push! propagate unwrapped
     return nothing
 end
 
@@ -81,6 +73,19 @@ end
 """
 $TYPEDSIGNATURES
 
+Fallback that throws a descriptive error for unsupported return types.
+Metrics must return a `Number`, a `NamedTuple`, or `nothing`.
+"""
+function _store_metric_value!(::MVHistory, metric_name::Symbol, ::Int, value)
+    message =
+        "Metric `$metric_name` returned a value of type $(typeof(value)), which cannot " *
+        "be stored in history. Metrics must return a Number, a NamedTuple, or nothing."
+    return error(message)  # always throws; this method never returns normally
+end
+
+"""
+$TYPEDSIGNATURES
+
 Evaluate all metrics and store their results in the history.
 
 This function handles three types of metric returns through multiple dispatch:
diff --git a/src/metrics/periodic.jl b/src/metrics/periodic.jl
@@ -10,7 +10,7 @@ This is useful for expensive metrics that don't need to be computed every epoch
 $TYPEDFIELDS
 
 # Behavior
-The metric is evaluated when `(epoch - offset) % frequency == 0`.
+The metric is evaluated when `epoch >= offset` and `(epoch - offset) % frequency == 0`.
 On other epochs, `evaluate!` returns `nothing` (which is skipped by `evaluate_metrics!`).
 
 # See also
@@ -82,7 +82,7 @@ Evaluate the wrapped metric only if the current epoch matches the frequency patt
 - `nothing` otherwise (which is skipped by `evaluate_metrics!`)
 """
 function evaluate!(pm::PeriodicMetric, context)
-    if (context.epoch - pm.offset) % pm.frequency == 0
+    if context.epoch >= pm.offset && (context.epoch - pm.offset) % pm.frequency == 0
         return evaluate!(pm.metric, context)
     else
         return nothing  # Skip evaluation on this epoch
diff --git a/test/dagger.jl b/test/dagger.jl
@@ -69,6 +69,20 @@ using ValueHistories
         @test policy.statistical_model !== nothing
         @test haskey(history, :training_loss)
     end
+
+    @testset "DAgger - max_dataset_size cap" begin
+        algorithm = DAgger(; iterations=2, epochs_per_iteration=1, max_dataset_size=10)
+        model = generate_statistical_model(benchmark)
+        maximizer = generate_maximizer(benchmark)
+        policy = DFLPolicy(model, maximizer)
+        anticipative_policy = generate_anticipative_solver(benchmark)
+
+        history = train_policy!(
+            algorithm, policy, train_envs; anticipative_policy=anticipative_policy
+        )
+        @test history isa MVHistory  # NOTE(review): smoke test; cap size is not asserted
+        @test haskey(history, :training_loss)
+    end
 end
 
 @testset "Integration Tests" begin
diff --git a/test/fyl.jl b/test/fyl.jl
@@ -138,4 +138,23 @@ using ValueHistories
         _, epoch_sq_values = get(history, :epoch_squared)
         @test epoch_sq_values == [0.0, 1.0, 4.0, 9.0]
     end
+
+    @testset "PeriodicMetric offset guard" begin
+        model = generate_statistical_model(benchmark)
+        maximizer = generate_maximizer(benchmark)
+        policy = DFLPolicy(model, maximizer)
+        algorithm = PerturbedFenchelYoungLossImitation()
+
+        fired_at = Int[]  # epochs at which the probe metric was actually evaluated
+        probe = FunctionMetric(ctx -> (push!(fired_at, ctx.epoch); nothing), :probe)
+        # offset=5: should fire at epochs 5, 10, ... but NOT at epoch 0
+        periodic = PeriodicMetric(probe, 5; offset=5)
+
+        train_policy!(algorithm, policy, train_data; epochs=10, metrics=(periodic,))
+
+        @test 0 ∉ fired_at   # must not fire before offset
+        @test 5 ∈ fired_at   # must fire at offset
+        @test 10 ∈ fired_at  # must fire at offset + frequency
+        @test 3 ∉ fired_at   # must not fire on an epoch off the schedule
+    end
 end