rolch 0.1.6 -> 0.1.7

Weight the residual sum of squares by the exponential-forgetting vector in
OnlineGamlss.fit_beta (new helper init_forget_vector), and compute the RSS
denominator in OnlineGamlss.update_beta via the new utils.online_mean_update
(asymptotic/effective-training-length scaled online mean).

(Reconstructed from a whitespace-mangled copy of the patch; hunk line counts
preserved. Spelling of the local variable n_asymptotic fixed in the added
online_mean_update helper.)

diff --git a/pyproject.toml b/pyproject.toml
index 6370e95..7678fcf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "rolch"
-version = "0.1.6"
+version = "0.1.7"
 authors = [
     {name="Simon Hirsch", email="simon.hirsch@stud.uni-due.de"},
     {name="Jonathan Berrisch", email="jonathan.berrisch@uni-due.de"},
diff --git a/src/rolch/online_gamlss.py b/src/rolch/online_gamlss.py
index 06ece7a..aac9638 100644
--- a/src/rolch/online_gamlss.py
+++ b/src/rolch/online_gamlss.py
@@ -8,10 +8,16 @@
     online_coordinate_descent,
     online_coordinate_descent_path,
 )
-from rolch.gram import init_gram, init_inverted_gram, init_y_gram, update_inverted_gram
+from rolch.gram import (
+    init_gram,
+    init_inverted_gram,
+    init_y_gram,
+    update_inverted_gram,
+    init_forget_vector,
+)
 from rolch.information_criteria import select_best_model_by_information_criterion
 from rolch.scaler import OnlineScaler
-from rolch.utils import calculate_effective_training_length
+from rolch.utils import calculate_effective_training_length, online_mean_update
 
 
 class OnlineGamlss:
@@ -135,6 +141,9 @@ def fit_beta(
         iteration_inner,
         param,
     ):
+
+        f = init_forget_vector(self.forget, self.n_obs)
+
         if self.method == "ols":
             lambda_max = None
             lambda_path = None
@@ -143,10 +152,7 @@
             beta = (x_gram @ y_gram).flatten()
             residuals = y - X @ beta.T
 
-            if self.method == "ols" or self.intercept_only[param]:
-                rss = np.sum(residuals**2 * w) / np.mean(w)
-            else:
-                rss = np.sum(residuals**2 * w[:, None], axis=0) / np.mean(w)
+            rss = np.sum(residuals**2 * w * f) / np.mean(w * f)
 
         elif (self.method == "lasso") & self.intercept_only[param]:
             lambda_max = None
@@ -167,10 +173,7 @@
             )[0]
             residuals = y - X @ beta.T
 
-            if self.method == "ols" or self.intercept_only[param]:
-                rss = np.sum(residuals**2 * w, axis=0) / np.mean(w)
-            elif self.method == "lasso":
-                rss = np.sum(residuals**2 * w[:, None], axis=0) / np.mean(w)
+            rss = np.sum(residuals**2 * w * f, axis=0) / np.mean(w * f)
 
         elif self.method == "lasso":
             intercept = (
@@ -197,10 +200,9 @@
             residuals = y[:, None] - X @ beta_path.T
 
-            if self.method == "ols" or self.intercept_only[param]:
-                rss = np.sum(residuals**2 * w, axis=0) / np.mean(w)
-            elif self.method == "lasso":
-                rss = np.sum(residuals**2 * w[:, None], axis=0) / np.mean(w)
+            rss = np.sum(residuals**2 * w[:, None] * f[:, None], axis=0) / np.mean(
+                w * f
+            )
 
             model_params_n = np.sum(~np.isclose(beta_path, 0), axis=1)
 
             best_ic = select_best_model_by_information_criterion(
@@ -228,6 +230,11 @@ def update_beta(
         iteration_inner,
         param,
     ):
+
+        denom = online_mean_update(
+            self.mean_of_weights[param], w, self.forget, self.n_obs
+        )
+
         if self.method == "ols":
             # Not relevant for OLS
             lambda_max = None
@@ -240,7 +247,7 @@
             rss = (
                 (residuals**2).flatten() * w
                 + (1 - self.forget) * (self.rss[param] * self.mean_of_weights[param])
-            ) / (self.mean_of_weights[param] * (1 - self.forget) + w)
+            ) / denom
 
         elif (self.method == "lasso") & self.intercept_only[param]:
             lambda_max = None
@@ -264,7 +271,7 @@
             rss = (
                 (residuals**2).flatten() * w
                 + (1 - self.forget) * (self.rss[param] * self.mean_of_weights[param])
-            ) / (self.mean_of_weights[param] * (1 - self.forget) + w)
+            ) / denom
 
         elif self.method == "lasso":
             intercept = (
@@ -293,7 +300,7 @@
             rss = (
                 (residuals**2).flatten() * w
                 + (1 - self.forget) * (self.rss[param] * self.mean_of_weights[param])
-            ) / (self.mean_of_weights[param] * (1 - self.forget) + w)
+            ) / denom
 
             model_params_n = np.sum(np.isclose(beta_path, 0), axis=1)
             best_ic = select_best_model_by_information_criterion(
diff --git a/src/rolch/utils.py b/src/rolch/utils.py
index b235b6a..6c92225 100644
--- a/src/rolch/utils.py
+++ b/src/rolch/utils.py
@@ -1,4 +1,5 @@
 import sys
+import numpy as np
 
 
 def calculate_asymptotic_training_length(forget: float):
@@ -14,3 +15,21 @@ def calculate_effective_training_length(forget: float, n_obs: int):
         return n_obs
     else:
         return (1 - (1 - forget) ** n_obs) / forget
+
+
+def online_mean_update(avg: float, value: float, forget: float, n_seen: int):
+
+    n_asymptotic = calculate_asymptotic_training_length(forget)
+    n_eff = calculate_effective_training_length(forget, n_seen)
+
+    forget_scaled = forget * np.maximum(n_asymptotic / n_eff, 1.0)
+
+    diff = value - avg
+    incr = forget_scaled * diff
+
+    if forget_scaled > 0:
+        new_avg = avg + incr
+    else:
+        new_avg = avg + diff / n_seen
+
+    return new_avg