diff --git a/docs/make.jl b/docs/make.jl index 4a1ec1b..b33f305 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -5,11 +5,16 @@ using Literate md_dir = joinpath(@__DIR__, "src") tutorial_dir = joinpath(@__DIR__, "src", "tutorials") benchmarks_dir = joinpath(@__DIR__, "src", "benchmarks") -api_dir = joinpath(@__DIR__, "src", "api") tutorial_files = readdir(tutorial_dir) md_tutorial_files = [split(file, ".")[1] * ".md" for file in tutorial_files] -benchmark_files = [joinpath("benchmarks", e) for e in readdir(benchmarks_dir)] + +categories = [ + "Toy problems" => "toy", + "Static problems" => "static", + "Stochastic problems" => "stochastic", + "Dynamic problems" => "dynamic", +] include_tutorial = true @@ -20,6 +25,19 @@ if include_tutorial end end +benchmark_sections = Pair{String,Vector{String}}[] + +for (label, subdir) in categories + dir = joinpath(benchmarks_dir, subdir) + jl_files = filter(f -> endswith(f, ".jl"), readdir(dir)) + md_names = [splitext(f)[1] * ".md" for f in jl_files] + for file in jl_files + Literate.markdown(joinpath(dir, file), dir; documenter=true, execute=false) + end + md_paths = [joinpath("benchmarks", subdir, f) for f in md_names] + push!(benchmark_sections, label => md_paths) +end + makedocs(; modules=[DecisionFocusedLearningBenchmarks], authors="Members of JuliaDecisionFocusedLearning", @@ -32,7 +50,7 @@ makedocs(; "Creating custom benchmarks" => "custom_benchmarks.md", ], "Tutorials" => include_tutorial ? 
md_tutorial_files : [], - "Benchmark problems list" => benchmark_files, + "Benchmarks" => benchmark_sections, "API reference" => "api.md", ], ) @@ -44,6 +62,13 @@ if include_tutorial end end +for (_, subdir) in categories + dir = joinpath(benchmarks_dir, subdir) + for f in filter(f -> endswith(f, ".md"), readdir(dir)) + rm(joinpath(dir, f); force=true) + end +end + deploydocs(; repo="github.com/JuliaDecisionFocusedLearning/DecisionFocusedLearningBenchmarks.jl", devbranch="main", diff --git a/docs/src/benchmarks/argmax.md b/docs/src/benchmarks/argmax.md deleted file mode 100644 index 00a5e67..0000000 --- a/docs/src/benchmarks/argmax.md +++ /dev/null @@ -1,4 +0,0 @@ -# Argmax - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. diff --git a/docs/src/benchmarks/contextual_stochastic_argmax.md b/docs/src/benchmarks/contextual_stochastic_argmax.md deleted file mode 100644 index 59f588f..0000000 --- a/docs/src/benchmarks/contextual_stochastic_argmax.md +++ /dev/null @@ -1,37 +0,0 @@ -# Contextual Stochastic Argmax - -[`ContextualStochasticArgmaxBenchmark`](@ref) is a minimalist contextual stochastic optimization benchmark problem. - -The decision maker selects one item out of ``n``. Item values are uncertain at decision time: they depend on a base utility plus a context-correlated perturbation revealed only after the decision is made. An observable context vector, correlated with the perturbation via a fixed linear map ``W``, allows the learner to anticipate the perturbation and pick the right item. - -## Problem Formulation - -**Instance**: ``c_{\text{base}} \sim \mathcal{U}[0,1]^n``, base values for ``n`` items. - -**Context**: ``x_{\text{raw}} \sim \mathcal{N}(0, I_d)``, a ``d``-dimensional signal correlated with item values. The feature vector passed to the model is ``x = [c_{\text{base}};\, x_{\text{raw}}] \in \mathbb{R}^{n+d}``. 
- -**Scenario**: the realized item values are -```math -\xi = c_{\text{base}} + W x_{\text{raw}} + \varepsilon, \quad \varepsilon \sim \mathcal{N}(0, \sigma^2 I_n) -``` -where ``W \in \mathbb{R}^{n \times d}`` is a fixed matrix unknown to the learner. - -**Decision**: ``y \in \{e_1, \ldots, e_n\}`` (one-hot vector selecting one item). - -## Policies - -### DFL Policy - -```math -\xrightarrow[\text{Features}]{x} -\fbox{Neural network $\varphi_w$} -\xrightarrow[\text{Predicted values}]{\hat{\theta}} -\fbox{\texttt{one\_hot\_argmax}} -\xrightarrow[\text{Decision}]{y} -``` - -The neural network predicts item values ``\hat{\theta} \in \mathbb{R}^n`` from the feature vector ``x \in \mathbb{R}^{n+d}``. The default architecture is `Dense(n+d => n; bias=false)`, which can exactly recover the optimal linear predictor ``[I_n \mid W]``, so a well-trained model should reach near-zero gap. - -### SAA Policy - -``y_{\text{SAA}} = \operatorname{argmax}\bigl(\frac{1}{S}\sum_s \xi^{(s)}\bigr)`` — the exact SAA-optimal decision for linear argmax, accessible via `generate_baseline_policies(bench).saa`. diff --git a/docs/src/benchmarks/dvsp.md b/docs/src/benchmarks/dvsp.md deleted file mode 100644 index 2282597..0000000 --- a/docs/src/benchmarks/dvsp.md +++ /dev/null @@ -1,145 +0,0 @@ -# Dynamic Vehicle Scheduling - -The Dynamic Vehicle Scheduling Problem (DVSP) is a sequential decision-making problem where an agent must dynamically dispatch vehicles to serve customers that arrive over time. - -## Problem Description - -### Overview - -In the dynamic vehicle scheduling problem, a fleet operator must decide at each time step which customer to serve immediately and which to postpone to future time steps. -The goal is to serve all customers by the end of the planning horizon while minimizing total travel time. 
- -This is a simplified version of the more complex Dynamic Vehicle Routing Problem with Time Windows (DVRPTW), focusing on the core sequential decision-making aspects without capacity or time window constraints. - -The problem is characterized by: -- **Exogenous noise**: customer arrivals are stochastic and follow a fixed known distribution, independent of the agent's actions -- **Combinatorial action space**: at each time step, the agent must build vehicle routes to serve selected customers, which leads to a huge combinatorial action space - -### Mathematical Formulation - -The dynamic vehicle scheduling problem can be formulated as a finite-horizon Markov Decision Process (MDP): - -**State Space** ``\mathcal{S}``: At time step ``t``, the state ``s_t`` consists of: -```math -s_t = (R_t, D_t, t) -``` -where: -- ``R_t`` are the pending customer (not yet served), where each customer ``r_i \in R_t`` contains: - - ``x_i, y_i``: 2d spatial coordinates of the customer location - - ``\tau_i``: start time when the customer needs to be served - - ``s_i``: service time required to serve the customer -- ``D_t`` indicates which customers must be dispatched this time step (i.e. that cannot be postponed further, otherwise they will be infeasible at the next time step because of their start time) -- ``t \in \{1, 2, \ldots, T\}`` is the current time step - -The state also implicitly includes (constant over time): -- Travel duration matrix ``d_{ij}``: time to travel from location ``i`` to location ``j`` -- Depot location - -**Action Space** ``\mathcal{A}(s_t)``: The action at time step ``t`` is a set of vehicle routes: -```math -a_t = \{r_1, r_2, \ldots, r_k\} -``` -where each route ``r_i`` is a sequence of customer that starts and ends at the depot. - -A route is feasible if: -- It starts and ends at the depot -- It follows time constraints, i.e. customers are served on time - -**Transition Dynamics** ``\mathcal{P}(s_{t+1} | s_t, a_t)``: After executing routes ``a_t``: - -1. 
**Remove served customers** from the pending customer set -2. **Generate new customer arrivals** according to the underlying exogenous distribution -3. **Update must-dispatch set** based on postponement rules - -**Reward Function** ``r(s_t, a_t)``: The immediate reward is the negative total travel time of the routes: - -```math -r(s_t, a_t) = - \sum_{r \in a_t} \sum_{(i,j) \in r} d_{ij} -``` - -where ``d_{ij}`` is the travel duration from location ``i`` to location ``j``, and the sum is over all consecutive location pairs in each route ``r``. - -**Objective**: Find a policy ``\pi: \mathcal{S} \to \mathcal{A}`` that maximizes expected cumulative reward: -```math -\max_\pi \mathbb{E}\left[\sum_{t=1}^T r(s_t, \pi(s_t)) \right] -``` - -## Key Components - -### [`DynamicVehicleSchedulingBenchmark`](@ref) - -The main benchmark configuration with the following parameters: - -- `max_requests_per_epoch`: Maximum number of new customers per time step (default: 10) -- `Δ_dispatch`: Time delay between decision and vehicle dispatch (default: 1.0) -- `epoch_duration`: Duration of each decision time step (default: 1.0) -- `two_dimensional_features`: Whether to use simplified 2D features instead of full feature set (default: false) - -### Instance Generation - -Problem instances are generated from static vehicle routing datasets and include: - -- **Customer locations**: Spatial coordinates for pickup/delivery points -- **Depot location**: Central starting and ending point for all routes -- **Travel times**: Distance/duration matrix between all location pairs -- **Service times**: Service time each customer - -The dynamic version samples new customer arrivals from the static instance, drawing new customers by independently sampling: -- their locations from the set of static customer locations -- service times, uniformly from the range of service times in the static instance - -### Features - -The benchmark provides two feature matrix representations, containing one column per 
postponable customer in the state: - -**Full Features** (27-dimensional): -- Start times for postponable customers (1) -- End times (start + service time) (2) -- Travel time from depot to customer (3) -- Travel time from customer to depot (4) -- Slack time until next time step (5) -- % of must-dispatch customers that can reach this customer on time (6) -- % of customers reachable from this customer on time (7) -- % of customers that can reach this customer on time (8) -- % of customers reachable or that can reach this customer on time (9) -- Quantile-based travel times to other customers (9 quantiles) (10-18) -- Quantiles of % of reachable new customers (9 quantiles) (19-27) - -**2D Features** (simplified): -- Travel time from depot to customer (1) -- Mean travel time to other customers (2) - -## Benchmark Policies - -### Lazy Policy - -The lazy policy postpones all possible customers, serving only those that must be dispatched. - -### Greedy Policy - -The greedy policy serves all pending customers as soon as they arrive, without considering future consequences. - -## Decision-Focused Learning Policy - -```math -\xrightarrow[\text{State}]{s_t} -\fbox{Neural network $\varphi_w$} -\xrightarrow[\text{Prizes}]{\theta} -\fbox{Prize-collecting VSP} -\xrightarrow[\text{Routes}]{a_t} -``` - -**Components**: - -1. **Neural Network** ``\varphi_w``: Takes current state features as input and predicts customer prizes ``\theta = (\theta_1, \ldots, \theta_n)``, one value per postponable customer. -2. **Optimization Layer**: Solves the prize-collecting vehicle scheduling problem to determine optimal routes given the predicted prizes, by maximizing total collected prizes minus travel costs: - ```math - \max_{a_t\in \mathcal{A}(s_t)} \sum_{r \in a_t} \left( \sum_{i \in r} \theta_i - \sum_{(i,j) \in r} d_{ij} \right) - ``` - This can be modeled as a flow linear program on a directed acyclic graph (DAG) and is solved using standard LP solvers. 
- -The neural network architecture adapts to the feature dimensionality: -- **2D features**: `Dense(2 => 1)`, applied in parallel to each postponable customer -- **Full features**: `Dense(27 => 1)` applied in parallel to each postponable customer - -**Note:** one can also use more complex architectures such as a deeper MLP or a graph neural network for better performance. diff --git a/docs/src/benchmarks/dynamic/dvsp.jl b/docs/src/benchmarks/dynamic/dvsp.jl new file mode 100644 index 0000000..9c86f5e --- /dev/null +++ b/docs/src/benchmarks/dynamic/dvsp.jl @@ -0,0 +1,115 @@ +# # Dynamic Vehicle Scheduling +# Dispatch vehicles to customers arriving over time: at each step the agent decides which +# customers to serve now and which to postpone, minimizing total travel cost. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = DynamicVehicleSchedulingBenchmark() + +# ## A sample episode +# +# Generate one environment and roll it out with the greedy policy (serves all pending +# customers immediately): +policies = generate_baseline_policies(b) +env = generate_environments(b, 1)[1] +_, trajectory = evaluate_policy!(policies.greedy, env) + +# One step: depot (green square), must-dispatch customers (red stars; deadline reached), +# postponable customers (blue triangles), vehicle routes (lines): +plot_solution(b, trajectory[1]) + +# Multiple steps side by side — customers accumulate and routes change over time: +plot_trajectory(b, trajectory[1:min(3, length(trajectory))]) + +# ## DFL pipeline components + +# The DFL agent chains two components: a neural network predicting a prize per customer: +model = generate_statistical_model(b) # Dense(27 → 1) per customer: state features → prize +# and a maximizer selecting routes that balance collected prizes against travel costs: +maximizer = generate_maximizer(b) # prize-collecting VSP solver + +# At each step, the model assigns a prize to each postponable customer. 
The solver then +# selects routes maximizing collected prizes minus travel costs, deciding which customers +# to serve now and which to defer. + +# --- +# ## Problem Description +# +# ### Overview +# +# In the **Dynamic Vehicle Scheduling Problem (DVSP)**, a fleet operator must decide at +# each time step which customers to serve immediately and which to postpone. The goal is +# to serve all customers by the end of the planning horizon while minimizing total travel time. +# +# The problem is characterized by: +# - **Exogenous noise**: customer arrivals are stochastic and follow a fixed distribution +# - **Combinatorial action space**: routes are built over a large set of customers +# +# ### Mathematical Formulation +# +# **State** ``s_t = (R_t, D_t, t)`` where: +# - ``R_t``: pending customers, each with coordinates, start time, service time +# - ``D_t``: must-dispatch customers (cannot be postponed further) +# - ``t``: current time step +# +# **Action** ``a_t``: a set of vehicle routes ``\{r_1, r_2, \ldots, r_k\}``, each starting +# and ending at the depot, satisfying time constraints. +# +# **Reward:** +# ```math +# r(s_t, a_t) = -\sum_{r \in a_t} \sum_{(i,j) \in r} d_{ij} +# ``` +# +# **Objective:** +# ```math +# \max_\pi \; \mathbb{E}\!\left[\sum_{t=1}^T r(s_t, \pi(s_t))\right] +# ``` +# +# ## Key Components +# +# ### [`DynamicVehicleSchedulingBenchmark`](@ref) +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `max_requests_per_epoch` | Maximum new customers per time step | 10 | +# | `Δ_dispatch` | Time delay between decision and dispatch | 1.0 | +# | `epoch_duration` | Duration of each time step | 1.0 | +# | `two_dimensional_features` | Use 2D instead of full 27D features | `false` | +# +# ### Features +# +# **Full features (27D per customer):** start/end times, depot travel times, slack, +# reachability ratios, quantile-based travel times to other customers. 
+# +# **2D features:** travel time from depot + mean travel time to others. +# +# ## Baseline Policies +# +# | Policy | Description | +# |--------|-------------| +# | Lazy | Postpones all possible customers; serves only must-dispatch | +# | Greedy | Serves all pending customers immediately | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{State}]{s_t} +# \fbox{Neural network $\varphi_w$} +# \xrightarrow[\text{Prizes}]{\theta} +# \fbox{Prize-collecting VSP} +# \xrightarrow[\text{Routes}]{a_t} +# ``` +# +# The neural network predicts a prize ``\theta_i`` for each postponable customer. +# The prize-collecting VSP solver then maximizes collected prizes minus travel costs: +# ```math +# \max_{a_t \in \mathcal{A}(s_t)} \sum_{r \in a_t} \left(\sum_{i \in r} \theta_i - \sum_{(i,j) \in r} d_{ij}\right) +# ``` +# +# **Model:** +# - 2D features: `Dense(2 → 1)` applied independently per customer +# - Full features: `Dense(27 → 1)` applied independently per customer +# +# !!! note "Reference" +# TODO: add original reference. diff --git a/docs/src/benchmarks/dynamic/dynamic_assortment.jl b/docs/src/benchmarks/dynamic/dynamic_assortment.jl new file mode 100644 index 0000000..9d00d3c --- /dev/null +++ b/docs/src/benchmarks/dynamic/dynamic_assortment.jl @@ -0,0 +1,112 @@ +# # Dynamic Assortment +# Select which K items to offer at each step to maximize revenue: customer preferences +# evolve dynamically based on purchase history (hype and saturation effects). 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = DynamicAssortmentBenchmark() + +# ## A sample episode +# +# Generate one environment and roll out with the greedy policy (offers the K highest-priced +# items at every step): +policies = generate_baseline_policies(b) +env = generate_environments(b, 1)[1] +_, trajectory = evaluate_policy!(policies.greedy, env) + +# One step: bar chart of item prices, green = items in the offered assortment: +plot_solution(b, trajectory[1]) + +# A few steps side by side (prices are fixed; assortment composition changes over time): +plot_trajectory(b, trajectory[1:min(4, length(trajectory))]) + +# ## DFL pipeline components + +# The DFL agent chains two components: a neural network predicting utility scores per item: +model = generate_statistical_model(b) # MLP: state features → predicted utility per item +# and a maximizer offering the K items with the highest predicted utilities: +maximizer = generate_maximizer(b) # top-K selection by predicted utility + +# At each step, the model maps the current state (prices, hype, saturation, history) to a +# utility score per item. The maximizer selects the K items with the highest scores. + +# --- +# ## Problem Description +# +# ### Overview +# +# In the **Dynamic Assortment problem**, a retailer has ``N`` items and must select +# ``K`` to offer at each time step. Customer preferences evolve based on purchase history +# through **hype** (recent purchases increase demand) and **saturation** (repeated +# purchases slightly decrease demand). 
+# +# ### Mathematical Formulation +# +# **State** ``s_t = (p, f, h_t, \sigma_t, t, \mathcal{H}_t)`` where: +# - ``p``: fixed item prices +# - ``f``: static item features +# - ``h_t, \sigma_t``: current hype and saturation levels +# - ``t``: current time step +# - ``\mathcal{H}_t``: purchase history (last 5 purchases) +# +# **Action:** ``a_t \subseteq \{1,\ldots,N\}`` with ``|a_t| = K`` +# +# **Customer choice** (multinomial logit): +# ```math +# \mathbb{P}(i \mid a_t, s_t) = \frac{\exp(\theta_i(s_t))}{\sum_{j \in a_t} \exp(\theta_j(s_t)) + 1} +# ``` +# +# **Transition dynamics:** +# - Hype: ``h_{t+1}^{(i)} = h_t^{(i)} \times m^{(i)}`` where the multiplier reflects recent purchases +# - Saturation: increases by ×1.01 for the purchased item +# +# **Reward:** ``r(s_t, a_t) = p_{i^\star}`` (price of the purchased item, 0 if no purchase) +# +# **Objective:** +# ```math +# \max_\pi \; \mathbb{E}\!\left[\sum_{t=1}^T r(s_t, \pi(s_t))\right] +# ``` +# +# ## Key Components +# +# ### [`DynamicAssortmentBenchmark`](@ref) +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `N` | Number of items in catalog | 20 | +# | `d` | Static feature dimension per item | 2 | +# | `K` | Assortment size | 4 | +# | `max_steps` | Steps per episode | 80 | +# | `exogenous` | Whether dynamics are exogenous | `false` | +# +# ### State Observation +# +# Agents observe a ``(d+8) \times N`` normalized feature matrix per step containing: +# current prices, hype, saturation, static features, change in hype/saturation from +# previous step and from initial state, and normalized time step. 
+# +# ## Baseline Policies +# +# | Policy | Description | +# |--------|-------------| +# | Expert | Brute-force enumeration of all ``\binom{N}{K}`` subsets; optimal but slow | +# | Greedy | Selects the ``K`` items with highest prices | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{State}]{s_t} +# \fbox{Neural network $\varphi_w$} +# \xrightarrow[\text{Utilities}]{\theta \in \mathbb{R}^N} +# \fbox{Top-K} +# \xrightarrow[\text{Assortment}]{a_t} +# ``` +# +# **Model:** `Chain(Dense(d+8 → 5), Dense(5 → 1), vec)` — predicts one utility score +# per item from the current state features. +# +# **Maximizer:** `TopKMaximizer(K)` — selects the top ``K`` items by predicted utility. +# +# !!! note "Reference" +# [Structured Reinforcement Learning for Combinatorial Decision-Making](https://arxiv.org/abs/2505.19053) diff --git a/docs/src/benchmarks/dynamic/maintenance.jl b/docs/src/benchmarks/dynamic/maintenance.jl new file mode 100644 index 0000000..a9205de --- /dev/null +++ b/docs/src/benchmarks/dynamic/maintenance.jl @@ -0,0 +1,105 @@ +# # Maintenance +# Decide which components to maintain at each step to minimize failure and maintenance costs: +# components degrade stochastically and the agent has limited maintenance capacity. 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = MaintenanceBenchmark(; N=5, K=2) # 5 components, maintain up to 2 per step + +# ## A sample episode +# +# Generate one environment and roll out with the greedy policy (maintains the most degraded +# components up to capacity): +policies = generate_baseline_policies(b) +env = generate_environments(b, 1)[1] +_, trajectory = evaluate_policy!(policies.greedy, env) + +# One step: bars show degradation levels (1 = new, n = failed), green = maintained, red = failed: +plot_solution(b, trajectory[1]) + +# A few steps side by side showing degradation evolving over time: +plot_trajectory(b, trajectory[1:min(4, length(trajectory))]) + +# ## DFL pipeline components + +# The DFL agent chains two components: a neural network predicting urgency scores per component: +model = generate_statistical_model(b) # two-layer MLP: degradation state → urgency scores +# and a maximizer selecting the most urgent components for maintenance: +maximizer = generate_maximizer(b) # top-K selection among components with positive scores + +# At each step, the model maps the current degradation state to an urgency score per component. +# The maximizer selects up to K components with the highest positive scores for maintenance. + +# --- +# ## Problem Description +# +# ### Overview +# +# In the **Maintenance benchmark**, a system has ``N`` identical components, each with +# ``n`` discrete degradation states (1 = new, ``n`` = failed). At each step, the agent +# can maintain up to ``K`` components. Maintained components are reset to state 1. +# Unmaintained components degrade stochastically. +# +# ### Mathematical Formulation +# +# **State** ``s_t \in \{1,\ldots,n\}^N``: degradation level of each component. 
+# +# **Action** ``a_t \subseteq \{1,\ldots,N\}`` with ``|a_t| \leq K`` +# +# **Transition dynamics:** For each component ``i``: +# - If maintained: ``s_{t+1}^i = 1`` +# - If not maintained: ``s_{t+1}^i = \min(s_t^i + 1, n)`` with probability ``p``, else ``s_t^i`` +# +# **Cost:** +# ```math +# c(s_t, a_t) = c_m \cdot |a_t| + c_f \cdot \#\{i : s_t^i = n\} +# ``` +# +# **Objective:** +# ```math +# \min_\pi \; \mathbb{E}\!\left[\sum_{t=1}^T c(s_t, \pi(s_t))\right] +# ``` +# +# ## Key Components +# +# ### [`MaintenanceBenchmark`](@ref) +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `N` | Number of components | 2 | +# | `K` | Max simultaneous maintenance operations | 1 | +# | `n` | Degradation levels per component | 3 | +# | `p` | Degradation probability per step | 0.2 | +# | `c_f` | Failure cost per failed component | 10.0 | +# | `c_m` | Maintenance cost per maintained component | 3.0 | +# | `max_steps` | Steps per episode | 80 | +# +# ### Instance Generation +# +# Each instance has random starting degradation states uniformly drawn from ``\{1,\ldots,n\}``. +# +# ## Baseline Policies +# +# | Policy | Description | +# |--------|-------------| +# | Greedy | Maintains components in the last degradation state before failure, up to capacity | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{State}]{s_t \in \{1,\ldots,n\}^N} +# \fbox{Neural network $\varphi_w$} +# \xrightarrow[\text{Scores}]{\theta \in \mathbb{R}^N} +# \fbox{Top-K (positive)} +# \xrightarrow[\text{Maintenance}]{a_t} +# ``` +# +# **Model:** `Chain(Dense(N → N), Dense(N → N), vec)` — two-layer MLP predicting one +# urgency score per component. +# +# **Maximizer:** `TopKPositiveMaximizer(K)` — selects the ``K`` components with the +# highest positive scores for maintenance. +# +# !!! note "Reference" +# TODO: add original reference. 
diff --git a/docs/src/benchmarks/dynamic_assortment.md b/docs/src/benchmarks/dynamic_assortment.md deleted file mode 100644 index 6f5264c..0000000 --- a/docs/src/benchmarks/dynamic_assortment.md +++ /dev/null @@ -1,158 +0,0 @@ -# Dynamic Assortment - -The Dynamic Assortment problem is a sequential decision-making benchmark where an agent must repeatedly select which subset of items to offer to customers over time. The goal is to maximize total revenue while accounting for dynamic customer preferences that evolve based on purchase history. - -## Problem Description - -### Overview - -In the dynamic assortment problem, a retailer has access to a catalog of ``N`` items and must decide which subset of exactly ``K`` items to offer to customers at each time step. Customers make purchasing decisions according to a choice model that depends on public features ``x``: - -- **Item prices**: Fixed monetary cost of each item -- **Item features**: Static characteristics of each item (size ``d``) -- **Hype**: Dynamic popularity that increases when items are purchased recently, and decays over time if not purchased -- **Saturation**: Dynamic measure that slightly increases when specific items are purchased - -Both hype and saturation evolve over time based on the agent's assortment decisions and customer purchases, this providing an endogenous multistage stochastic optimization problem. 
- -### Mathematical Formulation - -The dynamic assortment problem can be formulated as a finite-horizon Markov Decision Process (MDP) with the following components: - -**State Space** ``\mathcal{S}``: At time step ``t``, the state ``s_t`` consists of: -```math -s_t = (p, f, h_t, \sigma_t, t, \mathcal{H}_t) -``` -where: -- ``p \in \mathbb{R}^N`` are the fixed item prices -- ``f \in \mathbb{R}^{d \times N}`` are the static item features -- ``h_t \in \mathbb{R}^N`` are the current hype levels for each item -- ``\sigma_t \in \mathbb{R}^N`` are the current saturation levels for each item -- ``t \in \{1, 2, \ldots, T\}`` is the current time step -- ``\mathcal{H}_t`` is the purchase history (last 5 purchases) - -**Action Space** ``\mathcal{A}``: The action at time ``t`` is an assortment selection: -```math -a_t \subseteq \{1, 2, \ldots, N\} \text{ such that } |a_t| = K -``` - -**Customer Choice Model**: Given assortment ``a_t``, customers choose according to a multinomial logit model: -```math -\forall i\in a_t,\, \mathbb{P}(i | a_t, s_t) = \frac{\exp(\theta_i(s_t))}{\sum_{j\in a_t} \exp(\theta_j(s_t)) + 1} -``` -```math -\mathbb{P}(\text{no purchase} | a_t, s_t) = \frac{1}{\sum_{j\in a_t} \exp(\theta_j(s_t)) + 1} -``` - -where ``\theta_i(s_t)`` is the utility of item ``i`` at state ``s_t``, computed by a hidden utility function: -```math -\theta_i(s_t) = \Phi(p_i, h_t^{(i)}, \sigma_t^{(i)}, f_{\cdot,i}) -``` - -**Transition Dynamics** ``\mathcal{P}(s_{t+1} | s_t, a_t)``: After selecting assortment ``a_t`` and observing customer choice ``i^\star \sim \mathbb{P}(\cdot | a_t, s_t)``, the state evolves as: - -1. 
**Hype Update**: For each item ``i``, compute a hype multiplier based on recent purchase history: - ```math - m^{(i)} = 1 + \sum_{k=1}^{\min(5, |\mathcal{H}_t|)} \mathbf{1}_{i = \mathcal{H}_t[-k]} \cdot \alpha_k - ``` - where ``\mathcal{H}_t[-k]`` is the ``k``-th most recent purchase, and the factors are: - ```math - \alpha_1 = 0.02, \quad \alpha_2 = \alpha_3 = \alpha_4 = \alpha_5 = -0.005 - ``` - Then update: ``h_{t+1}^{(i)} = h_t^{(i)} \times m^{(i)}`` - -2. **Saturation Update**: - ```math - \sigma_{t+1}^{(i)} = \begin{cases} - \sigma_t^{(i)} \times 1.01 & \text{if } i = i^\star \\ - \sigma_t^{(i)} & \text{otherwise} - \end{cases} - ``` - -3. **History Update**: ``\mathcal{H}_{t+1} = \text{append}(\mathcal{H}_t, i^\star)`` (keeping last 5 purchases) - -**Reward Function** ``r(s_t, a_t, s_{t+1})``: The immediate reward is the revenue from the customer's purchase: -```math -r(s_t, a_t, s_{t+1}) = \begin{cases} -p_{i^\star} & \text{if customer purchases item } i^\star \\ -0 & \text{if no purchase} -\end{cases} -``` - -**Objective**: Find a policy ``\pi: \mathcal{S} \to \mathcal{A}`` that maximizes the expected cumulative reward: -```math -\max_\pi \mathbb{E}\left[\sum_{t=1}^T r(s_t, \pi(s_t), s_{t+1}) \right] -``` - -**Terminal Condition**: The episode terminates after ``T`` time steps, with no terminal reward. 
- -## Key Components - -### [`DynamicAssortmentBenchmark`](@ref) - -The main benchmark configuration with the following parameters: - -- `N`: Number of items in the catalog (default: 20) -- `d`: Dimension of static feature vectors (default: 2) -- `K`: Assortment size constraint (default: 4) -- `max_steps`: Number of time steps per episode (default: 80) -- `customer_choice_model`: linear mapping from features to utilities -- `exogenous`: Whether dynamics are exogenous (default: false) - -### Instance Generation - -Each problem instance includes: - -- **Prices**: Random values in [1, 10] for each item, plus 0 for no-purchase -- **Features**: Random static features in [1, 10] for each item -- **Initial State**: Random starting hype and saturation values in [1, 10] - -### Environment Dynamics - -The environment tracks: -- Current time step -- Purchase history (last 5 purchases) -- Current hype and saturation for each item -- Customer utilities computed from current state - -**State Observation**: Agents observe a normalized feature vector containing: -- Current full features (prices, hype, saturation, static features) -- Change in hype/saturation from previous step -- Change in hype/saturation from initial state -- Normalized current time step - -All features are divided by 10 for normalization. - -## Benchmark Policies - -### Expert Policy - -The expert policy computes the optimal assortment by brute-force enumeration: -1. Enumerate all possible K-subsets of the N items -2. For each subset, compute expected revenue using the choice model -3. Return the subset with highest expected revenue - -This provides an optimal baseline but is computationally expensive. - -### Greedy Policy - -The greedy policy selects the K items with the highest prices, ignoring dynamic effects and customer preferences. This provides a simple baseline. 
- -## Decision-Focused Learning Policy - -```math -\xrightarrow[\text{State}]{s_t} -\fbox{Neural network $\varphi_w$} -\xrightarrow[\text{Cost vector}]{\theta} -\fbox{Top K} -\xrightarrow[\text{Assortment}]{a_t} -``` - -**Components**: - -1. **Neural Network** ``\varphi_w``: Takes the current state ``s_t`` as input and predicts item utilities ``\theta = (\theta_1, \ldots, \theta_N)`` -2. **Optimization Layer**: Selects the top ``K`` items with highest predicted utilities to form the assortment ``a_t`` - -## Reference - -Based on the paper: [Structured Reinforcement Learning for Combinatorial Decision-Making](https://arxiv.org/abs/2505.19053) diff --git a/docs/src/benchmarks/fixed_size_shortest_path.md b/docs/src/benchmarks/fixed_size_shortest_path.md deleted file mode 100644 index 049724d..0000000 --- a/docs/src/benchmarks/fixed_size_shortest_path.md +++ /dev/null @@ -1,7 +0,0 @@ -# Shortest paths - -[`FixedSizeShortestPathBenchmark`](@ref) is a benchmark problem that consists of finding the shortest path in a grid graph between the top left and bottom right corners. -In this benchmark, the grid size is the same for all instances. - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. \ No newline at end of file diff --git a/docs/src/benchmarks/maintenance.md b/docs/src/benchmarks/maintenance.md deleted file mode 100644 index 060099d..0000000 --- a/docs/src/benchmarks/maintenance.md +++ /dev/null @@ -1,107 +0,0 @@ -# Maintenance problem with resource constraint - -The Maintenance problem with resource constraint is a sequential decision-making benchmark where an agent must repeatedly decide which components to maintain over time. The goal is to minimize total expected cost while accounting for independent degradation of components and limited maintenance capacity. 
- - -## Problem Description - -### Overview - -In this benchmark, a system consists of ``N`` identical components, each of which can degrade over ``n`` discrete states. State ``1`` means that the component is new, state $n$ means that the component is failed. At each time step, the agent can maintain up to $K$ components. - -This forms an endogenous multistage stochastic optimization problem, where the agent must plan maintenance actions over the horizon. - -### Mathematical Formulation - -The maintenance problem can be formulated as a finite-horizon Markov Decision Process (MDP) with the following components: - -**State Space** ``\mathcal{S}``: At time step ``t``, the state ``s_t \in [1:n]^N`` is the degradation state for each component. - -**Action Space** ``\mathcal{A}``: The action at time ``t`` is the set of components that are maintained at time ``t``: -```math -a_t \subseteq \{1, 2, \ldots, N\} \text{ such that } |a_t| \leq K -``` -### Transition Dynamics - -The state transitions depend on whether a component is maintained or not: - -For each component \(i\) at time \(t\): - -- **Maintained component** (\(i \in a_t\)): - -\[ -s_{t+1}^i = 1 \quad \text{(perfect maintenance)} -\] - -- **Unmaintained component** (\(i \notin a_t\)): - -\[ -s_{t+1}^i = -\begin{cases} -\min(s_t^i + 1, n) & \text{with probability } p,\\ -s_t^i & \text{with probability } 1-p. -\end{cases} -\] - -Here, \(p\) is the degradation probability, \(s_t^i\) is the current state of component \(i\), and \(n\) is the maximum (failed) state. - ---- - -### Cost Function - -The immediate cost at time \(t\) is: - -```math -c(s_t, a_t) = \Big( c_m \cdot |a_t| + c_f \cdot \#\{ i : s_t^i = n \} \Big) -``` - -Where: - -- $c_m$ is the maintenance cost per component. -- $|a_t|$ is the number of components maintained. -- $c_f$ is the failure cost per failed component. -- $\#\{ i : s_t^i = n \}$ counts the number of components in the failed state. 
- -This formulation captures the total cost for maintaining components and penalizing failures. - -**Objective**: Find a policy $\pi: \mathcal{S} \to \mathcal{A}$ that minimizes the expected cumulative cost: -```math -\min_\pi \mathbb{E}\left[\sum_{t=1}^T c(s_t, \pi(s_t)) \right] -``` - -**Terminal Condition**: The episode terminates after $T$ time steps, with no terminal reward. - -## Key Components - -### [`MaintenanceBenchmark`](@ref) - -The main benchmark configuration with the following parameters: - -- `N`: number of components (default: 2) -- `K`: maximum number of components that can be maintained simultaneously (default: 1) -- `n`: number of degradation states per component (default: 3) -- `p`: degradation probability (default: 0.2) -- `c_f`: failure cost (default: 10.0) -- `c_m`: maintenance cost (default: 3.0) -- `max_steps`: Number of time steps per episode (default: 80) - -### Instance Generation - -Each problem instance includes: - -- **Starting State**: Random starting degradation state in $[1,n]$ for each components. - -### Environment Dynamics - -The environment tracks: -- Current time step -- Current degradation state. - -**State Observation**: Agents observe a normalized feature vector containing the degradation state of each component. - -## Benchmark Policies - -### Greedy Policy - -Greedy policy that maintains components in the last two degradation states, up to the maintenance capacity. This provides a simple baseline. - diff --git a/docs/src/benchmarks/portfolio_optimization.md b/docs/src/benchmarks/portfolio_optimization.md deleted file mode 100644 index da14f5a..0000000 --- a/docs/src/benchmarks/portfolio_optimization.md +++ /dev/null @@ -1,15 +0,0 @@ -# Portfolio Optimization - -[`PortfolioOptimizationBenchmark`](@ref) is a Markovitz portfolio optimization problem, where asset prices are unknown, and only contextual data is available to predict these prices. 
-The goal is to predict asset prices $c$ and maximize the expected return of a portfolio, subject to a risk constraint using this maximization program: -```math -\begin{aligned} -\max\quad & c^\top x\\ -\text{s.t.}\quad & x^\top \Sigma x \leq \gamma\\ -& 1^\top x \leq 1\\ -& x \geq 0 -\end{aligned} -``` - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. \ No newline at end of file diff --git a/docs/src/benchmarks/ranking.md b/docs/src/benchmarks/ranking.md deleted file mode 100644 index b0069e4..0000000 --- a/docs/src/benchmarks/ranking.md +++ /dev/null @@ -1,4 +0,0 @@ -# Ranking - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. \ No newline at end of file diff --git a/docs/src/benchmarks/static/fixed_size_shortest_path.jl b/docs/src/benchmarks/static/fixed_size_shortest_path.jl new file mode 100644 index 0000000..8a6779d --- /dev/null +++ b/docs/src/benchmarks/static/fixed_size_shortest_path.jl @@ -0,0 +1,82 @@ +# # Shortest Path +# Find the cheapest path from the top-left to the bottom-right of a grid graph: +# edge costs are unknown and must be predicted from instance features. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = FixedSizeShortestPathBenchmark() + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: instance feature vector (observable at train and test time) +# - `θ`: true edge costs (training supervision only, hidden at test time) +# - `y`: path indicator vector (`y[e] = 1` if edge `e` is on the optimal path) +# +# True edge costs θ, averaged per vertex for display (hidden at test time — the model observes only `x`): +dataset = generate_dataset(b, 50; seed=0) +sample = first(dataset) +plot_instance(b, sample) + +# Left: edge costs. 
Right: optimal path (white dots): +plot_solution(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting edge costs: +model = generate_statistical_model(b) # linear map: features → predicted edge costs +# and a maximizer finding the shortest path given those costs: +maximizer = generate_maximizer(b) # Dijkstra shortest path on the grid graph + +# A randomly initialized policy predicts arbitrary costs, yielding a near-straight path: +θ_pred = model(sample.x) +plot_solution(b, DataSample(; sample.context..., x=sample.x, θ=θ_pred, y=maximizer(θ_pred))) + +# Optimality gap on the dataset (0 = optimal, higher is worse): +compute_gap(b, dataset, model, maximizer) + +# --- +# ## Problem Description +# +# A **fixed-size grid shortest path** problem. The graph is a directed acyclic grid of +# size ``(\text{rows} \times \text{cols})``, with edges pointing right and downward. +# Edge costs ``\theta \in \mathbb{R}^E`` are unknown; only a feature vector +# ``x \in \mathbb{R}^p`` is observed. The task is to find the minimum-cost path from +# vertex 1 (top-left) to vertex ``V`` (bottom-right): +# ```math +# y^* = \mathrm{argmin}_{y \in \mathcal{P}} \; \theta^\top y +# ``` +# where ``y \in \{0,1\}^E`` indicates selected edges and ``\mathcal{P}`` is the set of +# valid source-to-sink paths. +# +# Data is generated following the process in +# [Mandi et al., 2023](https://arxiv.org/abs/2307.13565). 
+# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `grid_size` | Grid dimensions `(rows, cols)` | `(5, 5)` | +# | `p` | Feature dimension | 5 | +# | `deg` | Polynomial degree for cost generation | 1 | +# | `ν` | Multiplicative noise level (0 = no noise) | 0.0 | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x \in \mathbb{R}^p} +# \fbox{Linear model} +# \xrightarrow[\text{Predicted costs}]{\hat{\theta} \in \mathbb{R}^E} +# \fbox{Dijkstra / Bellman-Ford} +# \xrightarrow[\text{Path}]{y \in \{0,1\}^E} +# ``` +# +# **Model:** `Chain(Dense(p → E))` — predicts one cost per edge. +# +# **Maximizer:** Dijkstra (default) or Bellman-Ford on negated weights to find the +# longest (maximum-weight) path. +# +# !!! note "Reference" +# Mandi et al. (2023), Decision-Focused Learning: Foundations, State of the Art, Benchmark and Future Opportunities. +# [arXiv:2307.13565](https://arxiv.org/abs/2307.13565) diff --git a/docs/src/benchmarks/static/portfolio_optimization.jl b/docs/src/benchmarks/static/portfolio_optimization.jl new file mode 100644 index 0000000..d7e7df0 --- /dev/null +++ b/docs/src/benchmarks/static/portfolio_optimization.jl @@ -0,0 +1,86 @@ +# # Portfolio Optimization +# Allocate wealth across assets to maximize expected return subject to a risk constraint: +# asset returns are unknown and must be predicted from contextual features. 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = PortfolioOptimizationBenchmark() + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: contextual feature vector (observable at train and test time) +# - `θ`: true expected asset returns (training supervision only, hidden at test time) +# - `y`: optimal portfolio weights solving the Markowitz QP given `θ` +# +# True expected returns θ (hidden at test time — the model observes only the feature vector `x`): +dataset = generate_dataset(b, 20; seed=0) +sample = first(dataset) +plot_instance(b, sample) + +# Left: true returns θ. Right: optimal portfolio weights y: +plot_solution(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting expected asset returns: +model = generate_statistical_model(b) # linear map: features → predicted returns +# and a maximizer allocating the optimal portfolio given those returns: +maximizer = generate_maximizer(b) # Markowitz QP solver (Ipopt via JuMP) + +# A randomly initialized policy predicts arbitrary returns, leading to a suboptimal allocation: +θ_pred = model(sample.x) +plot_solution(b, DataSample(; sample.context..., x=sample.x, θ=θ_pred, y=maximizer(θ_pred))) + +# Optimality gap on the dataset (0 = optimal, higher is worse): +compute_gap(b, dataset, model, maximizer) + +# --- +# ## Problem Description +# +# A **Markowitz portfolio optimization** problem where asset expected returns are unknown. +# Given contextual features ``x \in \mathbb{R}^p``, the learner predicts returns +# ``\hat{\theta} \in \mathbb{R}^d`` and solves: +# +# ```math +# \begin{aligned} +# \max_{y} \quad & \hat{\theta}^\top y \\ +# \text{s.t.} \quad & y^\top \Sigma y \leq \gamma \\ +# & \mathbf{1}^\top y \leq 1 \\ +# & y \geq 0 +# \end{aligned} +# ``` +# +# where ``\Sigma`` is the asset covariance matrix and ``\gamma`` is the risk budget. +# The solver uses [Ipopt.jl](https://github.com/jump-dev/Ipopt.jl) via JuMP. 
+# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `d` | Number of assets | 50 | +# | `p` | Feature dimension | 5 | +# | `deg` | Polynomial degree for data generation | 1 | +# | `ν` | Noise hyperparameter | 1.0 | +# +# Data is generated following the process in +# [Mandi et al., 2023](https://arxiv.org/abs/2307.13565). +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x \in \mathbb{R}^p} +# \fbox{Linear model} +# \xrightarrow[\text{Predicted returns}]{\hat{\theta} \in \mathbb{R}^d} +# \fbox{QP solver (Ipopt)} +# \xrightarrow[\text{Portfolio}]{y \in \mathbb{R}^d} +# ``` +# +# **Model:** `Dense(p → d)` — predicts one expected return per asset. +# +# **Maximizer:** Ipopt QP solver enforcing the variance and budget constraints. +# +# !!! note "Reference" +# Mandi et al. (2023), Decision-Focused Learning: Foundations, State of the Art, Benchmark and Future Opportunities. +# [arXiv:2307.13565](https://arxiv.org/abs/2307.13565) diff --git a/docs/src/benchmarks/static/ranking.jl b/docs/src/benchmarks/static/ranking.jl new file mode 100644 index 0000000..330785e --- /dev/null +++ b/docs/src/benchmarks/static/ranking.jl @@ -0,0 +1,73 @@ +# # Ranking +# Rank a set of items by predicted cost: the model must learn to sort items by their +# hidden scores from observable features alone. 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = RankingBenchmark() + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: feature matrix (rows = features, columns = items; observable at train and test time) +# - `θ`: true item costs (training supervision only, hidden at test time) +# - `y`: ordinal ranks derived from `θ` (`y[i] = 1` means item `i` has the highest cost) +# +# True costs θ (hidden at test time — the model observes only the feature matrix `x`): +dataset = generate_dataset(b, 50; seed=0) +sample = first(dataset) +plot_instance(b, sample) + +# The same costs, colored by rank (dark blue = best, light = worst): +plot_solution(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting item scores: +model = generate_statistical_model(b) # linear map: features → predicted costs +# and a maximizer ranking items by those scores: +maximizer = generate_maximizer(b) # ordinal ranking via sortperm + +# A randomly initialized policy produces an arbitrary ranking: +θ_pred = model(sample.x) +plot_solution(b, DataSample(; sample.context..., x=sample.x, θ=θ_pred, y=maximizer(θ_pred))) + +# Optimality gap on the dataset (0 = optimal, higher is worse): +compute_gap(b, dataset, model, maximizer) + +# --- +# ## Problem Description +# +# In the **Ranking benchmark**, a feature matrix ``x \in \mathbb{R}^{p \times n}`` is +# observed. A hidden linear encoder maps ``x`` to a cost vector +# ``\theta \in \mathbb{R}^n``. The task is to compute the ordinal ranking of the items +# by cost: +# ```math +# y_i = \mathrm{rank}(\theta_i \mid \theta_1, \ldots, \theta_n) +# ``` +# where ``y_i = 1`` means item ``i`` has the highest cost. 
+# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `instance_dim` | Number of items to rank | 10 | +# | `nb_features` | Feature dimension `p` | 5 | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x} +# \fbox{Linear model} +# \xrightarrow{\hat{\theta}} +# \fbox{ranking} +# \xrightarrow{y} +# ``` +# +# **Model:** `Chain(Dense(nb_features → 1; bias=false), vec)` — predicts one score per item. +# +# **Maximizer:** `ranking(θ)` — returns a vector of ordinal ranks via `invperm(sortperm(θ))`. +# +# !!! note "Reference" +# TODO: add original reference. diff --git a/docs/src/benchmarks/static/subset_selection.jl b/docs/src/benchmarks/static/subset_selection.jl new file mode 100644 index 0000000..4edeba8 --- /dev/null +++ b/docs/src/benchmarks/static/subset_selection.jl @@ -0,0 +1,77 @@ +# # Subset Selection +# Select the `k` most valuable items from a set of `n`: items with unknown values +# must be identified from observable features alone. 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = SubsetSelectionBenchmark() + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: item feature vector (observable at train and test time) +# - `θ`: true item values (equal to `x` by default; otherwise derived via a hidden encoder) +# - `y`: selection indicator (`y[i] = 1` for the `k` highest-value items, 0 otherwise) +# +# True item values θ (hidden at test time — the model observes only the feature vector `x`): +dataset = generate_dataset(b, 50; seed=0) +sample = first(dataset) +plot_instance(b, sample) + +# The same values, with the `k` selected items highlighted in green: +plot_solution(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting item scores: +model = generate_statistical_model(b) # linear map: features → predicted item scores +# and a maximizer selecting the top-k items by those scores: +maximizer = generate_maximizer(b) # top-k selection + +# A randomly initialized policy selects items with no relation to their true values: +θ_pred = model(sample.x) +plot_solution(b, DataSample(; sample.context..., x=sample.x, θ=θ_pred, y=maximizer(θ_pred))) + +# Optimality gap on the dataset (0 = optimal, higher is worse): +compute_gap(b, dataset, model, maximizer) + +# --- +# ## Problem Description +# +# In the **Subset Selection benchmark**, ``n`` items have unknown values ``\theta_i``. +# A feature vector ``x \in \mathbb{R}^n`` is observed (identity mapping by default). +# The task is to select the ``k`` items with the highest values: +# ```math +# y = \mathrm{top}_k(\theta) +# ``` +# where ``y \in \{0,1\}^n`` with exactly ``k`` ones. 
+# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `n` | Total number of items | 25 | +# | `k` | Number of items to select | 5 | +# | `identity_mapping` | Use identity as the hidden mapping | `true` | +# +# When `identity_mapping=true`, features equal item values directly (`x = θ`). +# When `false`, a random linear layer is used as the hidden mapping. +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x} +# \fbox{Linear model} +# \xrightarrow{\hat{\theta}} +# \fbox{top-k} +# \xrightarrow{y} +# ``` +# +# **Model:** `Dense(n → n; bias=false)` — predicts a score per item. +# +# **Maximizer:** `top_k(θ, k)` — returns a boolean vector with `true` at the `k` +# highest-scoring positions. +# +# !!! note "Reference" +# TODO: add original reference. diff --git a/docs/src/benchmarks/static/warcraft.jl b/docs/src/benchmarks/static/warcraft.jl new file mode 100644 index 0000000..dc98354 --- /dev/null +++ b/docs/src/benchmarks/static/warcraft.jl @@ -0,0 +1,89 @@ +# # Warcraft +# Find the cheapest path on a 12×12 terrain map: cell travel costs are unknown and must +# be inferred from the RGB terrain image using a neural network. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = WarcraftBenchmark() + +# ## Observable input +# +# At inference time the decision-maker observes only the terrain image `x` (not the costs `θ`): +sample = generate_dataset(b, 1)[1] +plot_instance(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: terrain image (12×12×3 RGB array; observable at train and test time) +# - `θ`: true cell travel costs (training supervision only, hidden at test time) +# - `y`: optimal path indicator (`y[i,j] = 1` if cell `(i,j)` is on the path) +# +# Left: terrain image. Middle: true costs θ. 
Right: optimal path y: +plot_solution(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a CNN predicting cell travel costs from the terrain image: +model = generate_statistical_model(b) # ResNet18 CNN: terrain image → 12×12 cost map +# and a maximizer finding the shortest path given those costs: +maximizer = generate_maximizer(b) # Dijkstra shortest path on the 12×12 grid + +# An untrained CNN produces a near-uniform cost map, yielding a near-straight path: +θ_pred = model(sample.x) +plot_solution(b, DataSample(; sample.context..., x=sample.x, θ=θ_pred, y=maximizer(θ_pred))) + +# Optimality gap on this sample (0 = optimal, higher is worse): +compute_gap(b, [sample], model, maximizer) + +# --- +# ## Problem Description +# +# In the **Warcraft benchmark**, each instance is a 12×12 grid representing a Warcraft +# terrain map. Each cell has an unknown travel cost depending on its terrain type (forest, +# mountain, water, etc.). The task is to find the path from the top-left to the +# bottom-right corner that minimizes total travel cost. +# +# Formally, let ``\theta_{ij}`` be the (unknown) cost of cell ``(i,j)`` and +# ``y_{ij} \in \{0,1\}`` indicate whether cell ``(i,j)`` is on the path. The objective is: +# ```math +# y^* = \mathrm{argmin}_{y \in \mathcal{P}} \sum_{(i,j)} \theta_{ij} \, y_{ij} +# ``` +# where ``\mathcal{P}`` is the set of valid grid paths (4-connected, source to sink). +# +# The dataset contains 10 000 labeled terrain images from the Warcraft II tileset. +# It is downloaded automatically on first use via +# [DataDeps.jl](https://github.com/oxinabox/DataDeps.jl). +# +# ## Key Components +# +# **[`WarcraftBenchmark`](@ref)** has no parameters. 
+# +# | Method | Description | +# |--------|-------------| +# | `generate_dataset(b, n)` | Downloads and loads `n` terrain images with true costs and paths | +# | `generate_statistical_model(b)` | ResNet18 CNN (first 5 layers + adaptive maxpool + neg) | +# | `generate_maximizer(b; dijkstra=true)` | Dijkstra or Bellman-Ford shortest path | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Terrain image}]{x \in \mathbb{R}^{12 \times 12 \times 3}} +# \fbox{ResNet18 CNN} +# \xrightarrow[\text{Cell costs}]{\hat{\theta} \in \mathbb{R}^{12 \times 12}} +# \fbox{Dijkstra} +# \xrightarrow[\text{Path}]{y \in \{0,1\}^{12 \times 12}} +# ``` +# +# The CNN maps terrain pixel values to predicted cell costs, which are then passed to a +# shortest-path solver. Training end-to-end with +# [InferOpt.jl](https://github.com/JuliaDecisionFocusedLearning/InferOpt.jl) teaches +# the network to produce costs that lead to good paths, not just accurate cost estimates. +# +# !!! tip +# See the [Warcraft tutorial](../../warcraft_tutorial.md) for a complete end-to-end training +# example using `PerturbedMultiplicative` and `FenchelYoungLoss`. +# +# !!! note "Reference" +# Vlastelica et al. (2020), Differentiation of Blackbox Combinatorial Solvers, ICLR 2020. diff --git a/docs/src/benchmarks/stochastic/vsp.jl b/docs/src/benchmarks/stochastic/vsp.jl new file mode 100644 index 0000000..d132822 --- /dev/null +++ b/docs/src/benchmarks/stochastic/vsp.jl @@ -0,0 +1,118 @@ +# # Stochastic Vehicle Scheduling +# Assign vehicles to cover a set of tasks while minimizing costs under stochastic delays: +# the DFL agent learns to predict adjusted costs that implicitly hedge against uncertainty. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = StochasticVehicleSchedulingBenchmark() + +# ## A sample instance +# +# Each instance is a city with task locations and scheduled times. 
+# `store_city=true` is required to visualize the map (not needed for training): +sample = generate_dataset(b, 1; store_city=true)[1] +plot_instance(b, sample) + +# ## Untrained policy +# +# Each edge `(u, v)` has a 20-dimensional feature vector encoding schedule slack, travel +# times, and timing — this is what the model receives as `x` per edge: +# A DFL policy chains two components: a statistical model predicting adjusted edge costs: +model = generate_statistical_model(b) # linear map: task features → adjusted edge costs +# and a maximizer solving the deterministic VSP given those costs: +maximizer = generate_maximizer(b) # deterministic VSP solver (HiGHS MIP) + +# The untrained model predicts random edge costs; the resulting schedule is arbitrary. +# Run the solver on predicted costs to see a route visualization: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred; sample.context...) +plot_solution( + b, DataSample(; sample.context..., x=sample.x, θ=θ_pred, y=y_pred, extra=sample.extra) +) + +# --- +# ## Problem Description +# +# ### Overview +# +# In the **Vehicle Scheduling Problem (VSP)**, we consider a set of tasks ``V``. Each +# task ``v \in V`` has a scheduled beginning time ``t_v^b`` and end time ``t_v^e``, with +# ``t_v^e > t_v^b``. We denote ``t^{tr}_{(u,v)}`` the travel time from task ``u`` to task +# ``v``. A task ``v`` can follow ``u`` only if: +# ```math +# t_v^b \geq t_u^e + t^{tr}_{(u,v)} +# ``` +# +# An instance of VSP can be modeled as an acyclic directed graph where nodes are tasks +# and edges represent feasible successions. A solution is a set of disjoint paths such +# that all tasks are fulfilled exactly once to minimize total costs. +# +# In the **Stochastic VSP (StoVSP)**, after the scheduling decision is set, random delays +# propagate along vehicle tours. The objective becomes minimizing base costs plus expected +# total delay costs over scenarios. 
+# +# ### Mathematical Formulation +# +# **Variables:** Let ``y_{u,v} \in \{0,1\}`` indicate if a vehicle performs task ``v`` +# immediately after task ``u``. +# +# **Delay Propagation:** For each task ``v`` in scenario ``s``: +# - ``\gamma_v^s``: intrinsic delay of task ``v`` +# - ``d_v^s``: total accumulated delay +# - ``\delta_{u,v}^s = t_v^b - (t_u^e + t^{tr}_{(u,v)})``: slack time +# +# ```math +# d_v^s = \gamma_v^s + \max(d_u^s - \delta_{u,v}^s,\; 0) +# ``` +# +# **Objective:** +# ```math +# \min_{y} \; \sum_{(u,v)} c_{u,v} \, y_{u,v} + \mathbb{E}_{s \in S}\!\left[\sum_v C_d \, d_v^s\right] +# ``` +# +# ## Key Components +# +# ### [`StochasticVehicleSchedulingBenchmark`](@ref) +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `nb_tasks` | Number of tasks per instance | 25 | +# | `nb_scenarios` | Number of scenarios for objective evaluation | 10 | +# +# ### Instance Generation +# +# Each instance simulates a geographic city with depots and task locations. Tasks have +# realistic scheduled start/end times. Scenarios are random intrinsic delays ``\gamma`` +# drawn from a Log-Normal distribution. Feature vectors are 20-dimensional. 
+# +# ## Baseline Policies +# +# | Policy | Description | +# |--------|-------------| +# | `svs_deterministic_policy` | Solves the deterministic VSP, ignoring delays | +# | `svs_saa_policy` | SAA via column generation over ``K`` scenarios | +# | `svs_saa_mip_policy` | Exact SAA via compact MIP formulation | +# | `svs_local_search_policy` | Heuristic local search over sampled scenarios | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x \in \mathbb{R}^{20}} +# \fbox{Linear model $\varphi_w$} +# \xrightarrow[\text{Predicted cost}]{\hat{c}} +# \fbox{Deterministic VSP solver} +# \xrightarrow[\text{Routes}]{y} +# ``` +# +# By training end-to-end with the deterministic solver, the linear model learns adjusted +# costs ``\hat{c}`` that implicitly account for expected stochastic delays, while keeping +# the fast deterministic solver at inference time. +# +# **Model:** `Chain(Dense(20 → 1; bias=false), vec)` — predicts one adjusted cost per edge. +# +# **Maximizer:** `StochasticVehicleSchedulingMaximizer` — HiGHS MIP solver on the +# deterministic VSP instance. +# +# !!! note "Reference" +# TODO: add original reference. diff --git a/docs/src/benchmarks/subset_selection.md b/docs/src/benchmarks/subset_selection.md deleted file mode 100644 index 918e424..0000000 --- a/docs/src/benchmarks/subset_selection.md +++ /dev/null @@ -1,13 +0,0 @@ -# Subset Selection - -[`SubsetSelectionBenchmark`](@ref) is the most trivial benchmark problem in this package. -It is minimalistic and serves as a simple example for debugging and testing purposes. - -## Description -We have a set of ``n`` items, each item having an unknown value. -We want to select a subset of ``k`` items that maximizes the sum of the values of the selected items. - -As input, instead of the items costs, we are given a feature vector, such that an unknown linear mapping between the feature vector and the value of the items exists. 
- -By default, this linear mapping is the identity mapping, i.e., the value of each item is equal to the value of the corresponding feature vector element. -However, this mapping can be changed by setting the `identity_mapping` parameter to `false`. diff --git a/docs/src/benchmarks/toy/argmax.jl b/docs/src/benchmarks/toy/argmax.jl new file mode 100644 index 0000000..aaed6cd --- /dev/null +++ b/docs/src/benchmarks/toy/argmax.jl @@ -0,0 +1,84 @@ +# # Argmax +# Select the single best item from a set of `n` items. Item scores are **hidden**, +# only a feature matrix `x` correlated with these scores is observable. +# This problem can also be seen as a multiclass classification problem where +# we use an argmax layer instead of a softmax. This is not very useful in practice, it's more a +# minimalist toy problem to showcase DFL concepts in the simplest possible setting. + +using DecisionFocusedLearningBenchmarks +using Plots +using Statistics + +b = ArgmaxBenchmark(; instance_dim=10, nb_features=5, seed=0) + +# ## Observable input +# +# At inference time the decision-maker observes only a feature matrix `x` +# (rows = features, columns = items): +dataset = generate_dataset(b, 100; seed=0) +sample = first(dataset) +plot_instance(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: feature matrix (observable at train and test time) +# - `θ`: true item scores (training supervision only, hidden at test time) +# - `y`: optimal one-hot decision derived from `θ` +# +# The full training triple (features, true scores, and optimal decision): +plot_solution(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting scores from features: +model = generate_statistical_model(b) # linear map: features → predicted scores +# and a maximizer turning those scores into a decision: +maximizer = generate_maximizer(b) # one-hot argmax + +# A randomly initialized policy makes essentially random 
decisions: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred) +# +plot_solution(b, DataSample(; x=sample.x, θ=θ_pred, y=y_pred, sample.context...)) + +# The goal of training is to find parameters that maximize accuracy. +# Current accuracy on the dataset: +mean(maximizer(model(s.x)) == s.y for s in dataset) + +# --- +# ## Problem Description +# +# In the **Argmax benchmark**, a feature matrix ``x \in \mathbb{R}^{p \times n}`` is +# observed. A hidden linear encoder maps ``x`` to a score vector +# ``\theta = \text{encoder}(x) \in \mathbb{R}^n``. The task is to select the item with +# the highest score: +# ```math +# y = \mathrm{argmax}(\theta) +# ``` +# The solution ``y`` is encoded as a one-hot vector. +# The score vector ``\theta`` is never observed (only features ``x`` are available). +# The DFL pipeline trains a model ``f_w`` so that ``\mathrm{argmax}(f_w(x))`` matches +# ``\mathrm{argmax}(\theta)`` at decision time. +# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `instance_dim` | Number of items | 10 | +# | `nb_features` | Feature dimension `p` | 5 | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x \in \mathbb{R}^{p \times n}} +# \fbox{Linear model $f_w$} +# \xrightarrow[\text{Predicted scores}]{\hat{\theta} \in \mathbb{R}^n} +# \fbox{argmax} +# \xrightarrow[\text{Selection}]{y \in \{0,1\}^n} +# ``` +# +# **Model:** `Chain(Dense(nb_features → 1; bias=false), vec)`: a single linear layer +# predicting one score per item. +# +# **Maximizer:** `one_hot_argmax`: returns a one-hot vector at the argmax index. 
diff --git a/docs/src/benchmarks/toy/argmax2d.jl b/docs/src/benchmarks/toy/argmax2d.jl
new file mode 100644
index 0000000..0ef7649
--- /dev/null
+++ b/docs/src/benchmarks/toy/argmax2d.jl
@@ -0,0 +1,79 @@
+# # Argmax on a 2D polytope
+# Select the best vertex of a random 2D polytope: predict a 2D cost vector from features,
+# then return the vertex maximizing the dot product with it.
+
+using DecisionFocusedLearningBenchmarks
+using Plots
+
+b = Argmax2DBenchmark(; seed=0)
+
+# ## Observable input
+#
+# At inference time the decision-maker observes the feature vector `x` and the polytope shape,
+# but not the hidden cost direction `θ`:
+dataset = generate_dataset(b, 50; seed=0)
+sample = first(dataset)
+plot_instance(b, sample)
+
+# ## A training sample
+#
+# Each sample is a labeled triple `(x, θ, y)`:
+# - `x`: feature vector (observable at train and test time)
+# - `θ`: 2D cost direction (training supervision only, hidden at test time)
+# - `y`: polytope vertex maximizing `θᵀv` (optimal decision)
+# - `instance` (in `context`): polytope vertices (observable problem structure)
+#
+# The full training triple (polytope, cost direction θ, optimal vertex y):
+plot_solution(b, sample)
+
+# ## Untrained policy
+
+# A DFL policy chains two components: a statistical model predicting a 2D cost direction:
+model = generate_statistical_model(b) # linear map: features → 2D cost vector
+# and a maximizer selecting the best polytope vertex for that direction:
+maximizer = generate_maximizer(b) # vertex maximizing θᵀv over polytope vertices
+
+# A randomly initialized policy predicts an arbitrary cost direction:
+θ_pred = model(sample.x)
+plot_solution(
+    b,
+    DataSample(;
+        sample.context..., x=sample.x, θ=θ_pred, y=maximizer(θ_pred; sample.context...)
+    ),
+)
+
+# ---
+# ## Problem Description
+#
+# In the **Argmax2D benchmark**, each instance defines a random convex polytope
+# ``\mathcal{Y}(x) = \mathrm{conv}(v_1, \ldots, v_m)`` in ``\mathbb{R}^2``.
+# A hidden encoder maps features ``x \in \mathbb{R}^p`` to a 2D cost vector +# ``\theta \in \mathbb{R}^2``. The task is to find the polytope vertex maximizing +# the dot product: +# ```math +# y^* = \mathrm{argmax}_{v \in \mathcal{Y}(x)} \; \theta^\top v +# ``` +# +# This is a toy 2D combinatorial optimization problem useful for visualizing +# how well a model learns the cost direction. +# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `nb_features` | Feature dimension `p` | 5 | +# | `polytope_vertex_range` | Number of polytope vertices (list; one value drawn at random per instance) | `[6]` | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x} +# \fbox{Linear model} +# \xrightarrow{\hat{\theta} \in \mathbb{R}^2} +# \fbox{Polytope argmax} +# \xrightarrow{y} +# ``` +# +# **Model:** `Dense(nb_features → 2; bias=false)` — predicts a 2D cost direction. +# +# **Maximizer:** finds the vertex of the instance polytope with maximum dot product with θ. diff --git a/docs/src/benchmarks/toy/contextual_stochastic_argmax.jl b/docs/src/benchmarks/toy/contextual_stochastic_argmax.jl new file mode 100644 index 0000000..293616c --- /dev/null +++ b/docs/src/benchmarks/toy/contextual_stochastic_argmax.jl @@ -0,0 +1,103 @@ +# # Contextual Stochastic Argmax +# Select the best item when utilities are random but correlated with observable context: +# a linear model must learn the mapping from context to expected utilities. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = ContextualStochasticArgmaxBenchmark() + +# Stochastic benchmarks need a labeling policy to generate training targets. +# We use the anticipative oracle: given realized scenario ξ it returns the best item. 
+anticipative = generate_anticipative_solver(b) +policy = + (ctx, scenarios) -> [ + DataSample(; ctx.context..., x=ctx.x, y=anticipative(ξ), extra=(; scenario=ξ)) + for ξ in scenarios + ] +dataset = generate_dataset(b, 20; target_policy=policy, seed=0) +sample = first(dataset) + +# ## Observable input +# +# At inference time `c_base` and `x_raw` are known (not the realized utility vector ξ). +# `plot_instance` shows the base utilities `c_base`: +plot_instance(b, sample) + +# ## A training sample +# +# Stochastic benchmarks have no single ground-truth label: the optimal item depends on +# which utility scenario is realized. We label each sample with the anticipative oracle, +# which returns the best item given the realized scenario ξ. +# +# Each labeled sample contains: +# - `x`: feature vector `[c_base; x_raw]` (observable at train and test time) +# - `y`: optimal item for the realized scenario ξ (one-hot; anticipative oracle label) +# - `extra.scenario`: realized utility vector ξ (available only during training) +# +# Left: realized scenario ξ. Right: selected item (red): +plot_solution(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting expected item utilities: +model = generate_statistical_model(b) # linear map: features → predicted expected utilities +# and a maximizer selecting the item with the highest predicted utility: +maximizer = generate_maximizer(b) # one-hot argmax + +# A randomly initialized policy selects items with no relation to their expected utilities. +# Left: predicted utilities θ̂. Right: selected item (red): +θ_pred = model(sample.x) +plot_solution(b, DataSample(; sample.context..., x=sample.x, θ=θ_pred, y=maximizer(θ_pred))) + +# --- +# ## Problem Description +# +# ### Overview +# +# In the **Contextual Stochastic Argmax benchmark**, ``n`` items have random utilities +# that depend on observable context. 
Per instance: +# - ``c_\text{base} \sim U[0,1]^n``: base utilities (stored in `context`) +# - ``x_\text{raw} \sim \mathcal{N}(0, I_d)``: observable context features +# - Full features: ``x = [c_\text{base}; x_\text{raw}] \in \mathbb{R}^{n+d}`` +# +# The realized utility (scenario) is drawn as: +# ```math +# \xi = c_\text{base} + W \, x_\text{raw} + \varepsilon, \quad \varepsilon \sim \mathcal{N}(0, \sigma^2 I) +# ``` +# where ``W \in \mathbb{R}^{n \times d}`` is a fixed unknown perturbation matrix. +# +# The task is to select the item with the highest realized utility: +# ```math +# y^* = \mathrm{argmax}(\xi) +# ``` +# +# A linear model ``\hat{\theta} = [I \mid W] \cdot x`` can exactly recover the optimal +# solution in expectation. +# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `n` | Number of items | 10 | +# | `d` | Context feature dimension | 5 | +# | `noise_std` | Noise standard deviation σ | 0.1 | +# +# ## Baseline Policies +# +# - **SAA**: selects the item with highest mean utility over available scenarios. +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x = [c_\text{base}; x_\text{raw}]} +# \fbox{Linear model} +# \xrightarrow{\hat{\theta} \in \mathbb{R}^n} +# \fbox{argmax} +# \xrightarrow{y} +# ``` +# +# **Model:** `Dense(n+d → n; bias=false)` — can in principle recover the exact mapping +# ``[I \mid W]`` from training data. +# +# **Maximizer:** `one_hot_argmax`. diff --git a/docs/src/benchmarks/vsp.md b/docs/src/benchmarks/vsp.md deleted file mode 100644 index adcb772..0000000 --- a/docs/src/benchmarks/vsp.md +++ /dev/null @@ -1,6 +0,0 @@ -# Stochastic Vehicle Scheduling - -[`StochasticVehicleSchedulingBenchmark`](@ref). - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. 
diff --git a/docs/src/benchmarks/warcraft.md b/docs/src/benchmarks/warcraft.md deleted file mode 100644 index c78850e..0000000 --- a/docs/src/benchmarks/warcraft.md +++ /dev/null @@ -1,3 +0,0 @@ -# Warcraft - -See the tutorial for a full demo of [`WarcraftBenchmark`](@ref). diff --git a/ext/DFLBenchmarksPlotsExt.jl b/ext/DFLBenchmarksPlotsExt.jl index 0a5caae..c7e47b9 100644 --- a/ext/DFLBenchmarksPlotsExt.jl +++ b/ext/DFLBenchmarksPlotsExt.jl @@ -7,10 +7,18 @@ using Plots import DecisionFocusedLearningBenchmarks: has_visualization, plot_instance, plot_solution, plot_trajectory, animate_trajectory +include("plots/argmax_plots.jl") include("plots/argmax2d_plots.jl") +include("plots/ranking_plots.jl") +include("plots/subset_selection_plots.jl") +include("plots/portfolio_plots.jl") +include("plots/shortest_path_plots.jl") +include("plots/contextual_stochastic_argmax_plots.jl") include("plots/warcraft_plots.jl") include("plots/svs_plots.jl") include("plots/dvs_plots.jl") +include("plots/dynamic_assortment_plots.jl") +include("plots/maintenance_plots.jl") """ plot_solution(bench::AbstractBenchmark, sample::DataSample, y; kwargs...) diff --git a/ext/plots/argmax_plots.jl b/ext/plots/argmax_plots.jl new file mode 100644 index 0000000..7886aad --- /dev/null +++ b/ext/plots/argmax_plots.jl @@ -0,0 +1,58 @@ +has_visualization(::ArgmaxBenchmark) = true + +""" +$TYPEDSIGNATURES + +Plot the input features as a heatmap. Columns correspond to items, rows correspond to features. +""" +function plot_instance(::ArgmaxBenchmark, sample::DataSample; kwargs...) + x = sample.x # nb_features × n + n = size(x, 2) + return Plots.heatmap( + x; + xlabel="Item", + ylabel="Feature", + title="Features x (observable input)", + xticks=1:n, + kwargs..., + ) +end + +""" +$TYPEDSIGNATURES + +Plot the features `x`, scores `θ`, and decision `y` in `sample` as heatmaps. +All three share the same item axis (columns). +""" +function plot_solution(::ArgmaxBenchmark, sample::DataSample; kwargs...) 
+ x = sample.x # nb_features × n + θ = sample.θ # length n + y = sample.y # one-hot, length n + n = length(θ) + + p1 = Plots.heatmap( + x; ylabel="Feature", title="x (features, observable)", xticks=(1:n, fill("", n)) + ) + θ_min, θ_max = extrema(θ) + p2 = Plots.heatmap( + reshape(Float64.(θ), 1, n); + ylabel="θ", + title="θ: scores [$(round(θ_min; sigdigits=2)), $(round(θ_max; sigdigits=2))]", + yticks=false, + xticks=(1:n, fill("", n)), + colorbar=false, + ) + p3 = Plots.heatmap( + reshape(Float64.(y), 1, n); + xlabel="Item", + ylabel="y", + title="y (decision, one-hot)", + yticks=false, + xticks=1:n, + color=:Greens, + colorbar=false, + ) + + l = Plots.@layout [a{0.65h}; b{0.175h}; c{0.175h}] + return Plots.plot(p1, p2, p3; layout=l, size=(600, 420), kwargs...) +end diff --git a/ext/plots/contextual_stochastic_argmax_plots.jl b/ext/plots/contextual_stochastic_argmax_plots.jl new file mode 100644 index 0000000..601f941 --- /dev/null +++ b/ext/plots/contextual_stochastic_argmax_plots.jl @@ -0,0 +1,56 @@ +has_visualization(::ContextualStochasticArgmaxBenchmark) = true + +function plot_instance(::ContextualStochasticArgmaxBenchmark, sample::DataSample; kwargs...) + c_base = sample.c_base # base utilities from context + n = length(c_base) + return Plots.bar( + 1:n, + c_base; + legend=false, + xlabel="Item", + ylabel="Base utility", + title="Instance (base utilities c_base)", + color=:steelblue, + kwargs..., + ) +end + +function plot_solution(::ContextualStochasticArgmaxBenchmark, sample::DataSample; kwargs...) 
+ y = sample.y # one-hot vector + n = length(y) + + # Pick the best available utility vector to display + if hasproperty(sample.extra, :scenario) + u = sample.extra.scenario + u_title = "Realized scenario ξ" + elseif hasproperty(sample, :θ) && !isnothing(sample.θ) + u = sample.θ + u_title = "Predicted utilities θ̂" + else + u = sample.c_base + u_title = "Base utilities c_base" + end + + p1 = Plots.bar( + 1:n, + u; + legend=false, + xlabel="Item", + ylabel="Utility", + title=u_title, + color=:steelblue, + ) + + colors = [y[i] > 0 ? :firebrick : :steelblue for i in 1:n] + p2 = Plots.bar( + 1:n, + u; + color=colors, + legend=false, + xlabel="Item", + ylabel="Utility", + title="Selected item (red)", + ) + + return Plots.plot(p1, p2; layout=(1, 2), size=(800, 300), kwargs...) +end diff --git a/ext/plots/dynamic_assortment_plots.jl b/ext/plots/dynamic_assortment_plots.jl new file mode 100644 index 0000000..dff56ec --- /dev/null +++ b/ext/plots/dynamic_assortment_plots.jl @@ -0,0 +1,50 @@ +has_visualization(::DynamicAssortmentBenchmark) = true + +function plot_instance(::DynamicAssortmentBenchmark, sample::DataSample; kwargs...) + # sample.instance = (env.features, purchase_history); row 1 of features = prices (×10 to undo normalization) + prices = sample.instance[1][1, :] .* 10 + N = length(prices) + return Plots.bar( + 1:N, + prices; + legend=false, + xlabel="Item", + ylabel="Price", + title="Instance (item prices) — step $(length(sample.instance[2]) + 1)", + color=:steelblue, + kwargs..., + ) +end + +function plot_solution(::DynamicAssortmentBenchmark, sample::DataSample; kwargs...) + prices = sample.instance[1][1, :] .* 10 + y = sample.y # BitVector, selected items + N = length(prices) + colors = [y[i] ? 
:seagreen : :lightgray for i in 1:N] + return Plots.bar( + 1:N, + prices; + legend=false, + xlabel="Item", + ylabel="Price", + title="Assortment (green = offered) — step $(length(sample.instance[2]) + 1)", + color=colors, + kwargs..., + ) +end + +function plot_trajectory( + bench::DynamicAssortmentBenchmark, + trajectory::Vector{<:DataSample}; + max_steps=6, + cols=3, + kwargs..., +) + n = min(length(trajectory), max_steps) + rows = ceil(Int, n / cols) + steps = round.(Int, range(1, length(trajectory); length=n)) + plots = [plot_solution(bench, trajectory[t]) for t in steps] + return Plots.plot( + plots...; layout=(rows, cols), size=(cols * 300, rows * 250), kwargs... + ) +end diff --git a/ext/plots/maintenance_plots.jl b/ext/plots/maintenance_plots.jl new file mode 100644 index 0000000..3a95e82 --- /dev/null +++ b/ext/plots/maintenance_plots.jl @@ -0,0 +1,54 @@ +has_visualization(::MaintenanceBenchmark) = true + +function plot_instance(bench::MaintenanceBenchmark, sample::DataSample; kwargs...) + # sample.instance = degradation_state (Vector{Int}, values 1..n) + state = sample.instance + N = length(state) + n = bench.n + return Plots.bar( + 1:N, + state; + legend=false, + xlabel="Component", + ylabel="Degradation level", + title="Instance (degradation state)", + ylim=(0, n + 0.5), + color=:steelblue, + kwargs..., + ) +end + +function plot_solution(bench::MaintenanceBenchmark, sample::DataSample; kwargs...) + state = sample.instance + y = sample.y # BitVector, maintained components + N = length(state) + n = bench.n + colors = [y[i] ? :seagreen : (state[i] == n ? :firebrick : :steelblue) for i in 1:N] + labels = ["comp $i$(y[i] ? 
" ✓" : "")" for i in 1:N] + return Plots.bar( + labels, + state; + legend=false, + ylabel="Degradation level", + title="Solution (green = maintained, red = failed)", + ylim=(0, n + 0.5), + color=colors, + kwargs..., + ) +end + +function plot_trajectory( + bench::MaintenanceBenchmark, + trajectory::Vector{<:DataSample}; + max_steps=6, + cols=3, + kwargs..., +) + n = min(length(trajectory), max_steps) + rows = ceil(Int, n / cols) + steps = round.(Int, range(1, length(trajectory); length=n)) + plots = [plot_solution(bench, trajectory[t]) for t in steps] + return Plots.plot( + plots...; layout=(rows, cols), size=(cols * 300, rows * 250), kwargs... + ) +end diff --git a/ext/plots/portfolio_plots.jl b/ext/plots/portfolio_plots.jl new file mode 100644 index 0000000..77c3a7e --- /dev/null +++ b/ext/plots/portfolio_plots.jl @@ -0,0 +1,41 @@ +has_visualization(::PortfolioOptimizationBenchmark) = true + +function plot_instance(::PortfolioOptimizationBenchmark, sample::DataSample; kwargs...) + θ = sample.θ + d = length(θ) + return Plots.bar( + 1:d, + θ; + legend=false, + xlabel="Asset", + ylabel="Expected return", + title="Instance (expected returns θ)", + color=:steelblue, + kwargs..., + ) +end + +function plot_solution(::PortfolioOptimizationBenchmark, sample::DataSample; kwargs...) + θ = sample.θ + y = sample.y + d = length(θ) + p1 = Plots.bar( + 1:d, + θ; + legend=false, + xlabel="Asset", + ylabel="Expected return", + title="Expected returns θ", + color=:steelblue, + ) + p2 = Plots.bar( + 1:d, + y; + legend=false, + xlabel="Asset", + ylabel="Portfolio weight", + title="Portfolio weights y", + color=:seagreen, + ) + return Plots.plot(p1, p2; layout=(1, 2), size=(800, 300), kwargs...) 
+end diff --git a/ext/plots/ranking_plots.jl b/ext/plots/ranking_plots.jl new file mode 100644 index 0000000..dc50b2d --- /dev/null +++ b/ext/plots/ranking_plots.jl @@ -0,0 +1,35 @@ +has_visualization(::RankingBenchmark) = true + +function plot_instance(::RankingBenchmark, sample::DataSample; kwargs...) + θ = sample.θ + n = length(θ) + return Plots.bar( + 1:n, + θ; + legend=false, + xlabel="Item", + ylabel="Cost", + title="Instance (costs θ)", + color=:steelblue, + kwargs..., + ) +end + +function plot_solution(::RankingBenchmark, sample::DataSample; kwargs...) + θ = sample.θ + y = sample.y # y[i] = rank of item i (1 = best) + n = length(θ) + # Color by rank: rank 1 (best) in dark blue, rank n (worst) in light + palette = Plots.cgrad(:Blues, n; rev=true, categorical=true) + colors = [palette[y[i]] for i in 1:n] + return Plots.bar( + 1:n, + θ; + legend=false, + xlabel="Item", + ylabel="Cost", + title="Solution (color = rank, dark = best)", + color=colors, + kwargs..., + ) +end diff --git a/ext/plots/shortest_path_plots.jl b/ext/plots/shortest_path_plots.jl new file mode 100644 index 0000000..2a7fcf8 --- /dev/null +++ b/ext/plots/shortest_path_plots.jl @@ -0,0 +1,88 @@ +import Graphs: edges, src, dst + +has_visualization(::FixedSizeShortestPathBenchmark) = true + +""" +Map edge weights to a (rows × cols) vertex weight matrix by averaging incident edge weights, +and return a boolean (rows × cols) matrix marking vertices on the path. 
+""" +function _grid_matrices(bench::FixedSizeShortestPathBenchmark, θ, y) + rows, cols = bench.grid_size + n_v = rows * cols + g = bench.graph + + # Vertex weights: mean of absolute weights of incident edges + v_weights = zeros(Float64, n_v) + v_counts = zeros(Int, n_v) + for (i, e) in enumerate(edges(g)) + v_weights[src(e)] += abs(θ[i]) + v_counts[src(e)] += 1 + v_weights[dst(e)] += abs(θ[i]) + v_counts[dst(e)] += 1 + end + v_weights ./= max.(v_counts, 1) + + # Path vertices + on_path = falses(n_v) + for (i, e) in enumerate(edges(g)) + if y[i] + on_path[src(e)] = true + on_path[dst(e)] = true + end + end + + # Reshape to (rows, cols): vertex v → row ceil(v/cols), col ((v-1)%cols)+1 + weight_grid = reshape(v_weights, cols, rows)' + path_grid = reshape(on_path, cols, rows)' + return weight_grid, path_grid +end + +function plot_instance(bench::FixedSizeShortestPathBenchmark, sample::DataSample; kwargs...) + weight_grid, _ = _grid_matrices(bench, sample.θ, falses(length(sample.θ))) + return Plots.heatmap( + weight_grid; + yflip=true, + aspect_ratio=:equal, + title="Edge weights (per vertex)", + colorbar=true, + kwargs..., + ) +end + +function plot_solution(bench::FixedSizeShortestPathBenchmark, sample::DataSample; kwargs...) 
+ weight_grid, path_grid = _grid_matrices(bench, sample.θ, sample.y) + rows, cols = bench.grid_size + + p1 = Plots.heatmap( + weight_grid; + yflip=true, + aspect_ratio=:equal, + title="Edge weights", + colorbar=true, + framestyle=:none, + ) + + p2 = Plots.heatmap( + weight_grid; + yflip=true, + aspect_ratio=:equal, + title="Shortest path", + colorbar=false, + framestyle=:none, + color=:Blues, + ) + # Highlight path vertices with scatter + path_xs = Int[] + path_ys = Int[] + for r in 1:rows, c in 1:cols + if path_grid[r, c] + push!(path_xs, c) + push!(path_ys, r) + end + end + Plots.scatter!( + p2, path_xs, path_ys; color=:white, markersize=6, markerstrokewidth=0, label=false + ) + + return Plots.plot(p1, p2; layout=(1, 2), size=(700, 320), kwargs...) +end diff --git a/ext/plots/subset_selection_plots.jl b/ext/plots/subset_selection_plots.jl new file mode 100644 index 0000000..40e778b --- /dev/null +++ b/ext/plots/subset_selection_plots.jl @@ -0,0 +1,33 @@ +has_visualization(::SubsetSelectionBenchmark) = true + +function plot_instance(::SubsetSelectionBenchmark, sample::DataSample; kwargs...) + θ = sample.θ + n = length(θ) + return Plots.bar( + 1:n, + θ; + legend=false, + xlabel="Item", + ylabel="Value", + title="Instance (values θ)", + color=:steelblue, + kwargs..., + ) +end + +function plot_solution(::SubsetSelectionBenchmark, sample::DataSample; kwargs...) + θ = sample.θ + y = sample.y # y[i] = true if item i is selected + n = length(θ) + colors = [y[i] ? 
:seagreen : :lightgray for i in 1:n] + return Plots.bar( + 1:n, + θ; + legend=false, + xlabel="Item", + ylabel="Value", + title="Solution (selected items in green)", + color=colors, + kwargs..., + ) +end diff --git a/src/ContextualStochasticArgmax/ContextualStochasticArgmax.jl b/src/ContextualStochasticArgmax/ContextualStochasticArgmax.jl index 49d22e2..5a84825 100644 --- a/src/ContextualStochasticArgmax/ContextualStochasticArgmax.jl +++ b/src/ContextualStochasticArgmax/ContextualStochasticArgmax.jl @@ -121,6 +121,16 @@ include("policies.jl") """ $TYPEDSIGNATURES +Return the named baseline policies for [`ContextualStochasticArgmaxBenchmark`](@ref). +Each policy has signature `(ctx_sample, scenarios) -> Vector{DataSample}`. +""" +function Utils.generate_baseline_policies(::ContextualStochasticArgmaxBenchmark) + return (; saa=Policy("SAA", "argmax of mean scenarios", csa_saa_policy)) +end + +""" +$TYPEDSIGNATURES + Generates the anticipative solver for the benchmark. """ function Utils.generate_anticipative_solver(::ContextualStochasticArgmaxBenchmark) diff --git a/src/ContextualStochasticArgmax/policies.jl b/src/ContextualStochasticArgmax/policies.jl index 1dc2d28..2cf7ae5 100644 --- a/src/ContextualStochasticArgmax/policies.jl +++ b/src/ContextualStochasticArgmax/policies.jl @@ -17,16 +17,6 @@ function csa_saa_policy(ctx_sample, scenarios) ] end -""" -$TYPEDSIGNATURES - -Return the named baseline policies for [`ContextualStochasticArgmaxBenchmark`](@ref). -Each policy has signature `(ctx_sample, scenarios) -> Vector{DataSample}`. -""" -function Utils.generate_baseline_policies(::ContextualStochasticArgmaxBenchmark) - return (; saa=Policy("SAA", "argmax of mean scenarios", csa_saa_policy)) -end - """ $TYPEDEF