# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.deterministic import Fourier
from scipy.interpolate import interp1d
Python Code
# read data
= pd.read_csv("../posts/2024-10-02-ts-fundamentals-whats-a-time-series/example_ts_data.csv")
data_raw
= (
data_raw # select columns
"Country", "Product", "Date", "Revenue"]]
data_raw[[# change data types
.assign(= pd.to_datetime(data_raw["Date"]),
Date = pd.to_numeric(data_raw["Revenue"])
Revenue
)
)
# print the first few rows
print(data_raw.head())
# filter on specific series
= data_raw[(data_raw["Country"] == "United States") & (data_raw["Product"] == "Cookies")]
us_ck_raw
"Date", inplace=True)
us_ck_raw.set_index(
print(us_ck_raw.head())
# randomly replace some revenue values with NaN
123)
np.random.seed(
= us_ck_raw.copy()
us_ck_mv
=0.03).index, "Revenue"] = np.nan
us_ck_mv.loc[us_ck_mv.sample(frac
print(us_ck_mv.head())
# plot the series
=(10, 6))
plt.figure(figsize"Revenue"], label = "Cookie Revenue", color = "blue")
plt.plot(us_ck_mv.index, us_ck_raw["US Cookie Revenue - with Missing Values")
plt.title("Date")
plt.xlabel("Revenue")
plt.ylabel(True)
plt.grid(
# save the plot
# plt.savefig("chart1", dpi = 300, bbox_inches = "tight")
# Function to interpolate missing values
def na_interp(ts, freq=None, lambda_=None, use_linear=None):
# Ensure ts is a pandas Series
if not isinstance(ts, pd.Series):
raise ValueError("Input `ts` must be a pandas Series.")
# Handle cases where there are no missing values
if ts.isna().sum() == 0:
return ts
# Frequency determination
if freq is None:
= pd.infer_freq(ts.index) or 1
freq
# Automatic linear interpolation decision
if use_linear is None:
= freq <= 1 or ts.notna().sum() <= 2 * freq
use_linear
# Apply Box-Cox transformation if lambda_ is provided
def boxcox_transform(series, lmbda):
if lmbda is None:
return series
if lmbda == 0:
return np.log(series)
return (series ** lmbda - 1) / lmbda
def inv_boxcox_transform(series, lmbda):
if lmbda is None:
return series
if lmbda == 0:
return np.exp(series)
return (series * lmbda + 1) ** (1 / lmbda)
= ts.copy()
ts_original = ts.astype(float)
ts if lambda_ is not None:
= boxcox_transform(ts, lambda_)
ts
if use_linear:
# Linear interpolation
= ts.interpolate(method='linear', limit_direction='both')
ts_interpolated else:
# Seasonal interpolation
= ts.interpolate(method='linear', limit_direction='both') # Pre-fill gaps for STL
ts_filled = STL(ts_filled, seasonal=freq, robust=True).fit()
stl
# Interpolate seasonally adjusted series
= stl.trend + stl.resid # Seasonally adjusted component
sa
# Convert DatetimeIndex to numeric values for interpolation
= ts.index.astype(int) / 10**9 # Convert to seconds since epoch
idx_numeric = interp1d(
sa_interp_func ~ts.isna()],
idx_numeric[~ts.isna()],
sa[=False,
bounds_error="extrapolate"
fill_value
)= sa_interp_func(idx_numeric)
sa_interp
# Add back the seasonal component
= pd.Series(sa_interp + stl.seasonal, index=ts.index)
ts_interpolated
# Back-transform if Box-Cox was applied
if lambda_ is not None:
= inv_boxcox_transform(ts_interpolated, lambda_)
ts_interpolated
# Check for stability (fallback to linear if unstable)
if not use_linear:
= ts_original.dropna().max() - ts_original.dropna().min()
ts_range if (ts_interpolated.max() > ts_original.max() + 0.5 * ts_range or
min() < ts_original.min() - 0.5 * ts_range):
ts_interpolated.return na_interp(ts_original, freq=freq, lambda_=lambda_, use_linear=True)
return ts_interpolated
# get the series with missing values and convert to float
= us_ck_mv["Revenue"]
us_ck_missing = np.array(us_ck_missing, dtype=float)
us_ck_missing
# get the monthly date range, using first day of month
= pd.date_range(start="2019-01-01", periods=60, freq="M") # Monthly frequency
date_range
# Create a pandas Series
= pd.Series(us_ck_missing, index=date_range)
ts
# Interpolating missing values
= na_interp(ts, freq=11)
ts_interpolated
# Plot the results, with origin on top
=(10, 6))
plt.figure(figsize="Interpolated", color="red", linestyle="--")
plt.plot(ts_interpolated, label="Original", color="blue")
plt.plot(ts, label
plt.legend()True)
plt.grid("US Cookie Revenue - with Missing Values Interpolated")
plt.title("Date")
plt.xlabel("Revenue")
plt.ylabel(
# save the plot
# plt.savefig("chart2", dpi = 300, bbox_inches = "tight")
# STL decomposition
= STL(us_ck_raw["Revenue"], seasonal = 13, period = 12)
stl = stl.fit()
res
# plot the decomposition in a stacked chart
= plt.subplots(4, 1, figsize = (10, 8))
fig, (ax1, ax2, ax3, ax4)
# original data
= "Original", color = "blue")
ax1.plot(res.observed, label "Original Data")
ax1.set_title("Revenue")
ax1.set_ylabel(True)
ax1.grid(
ax1.legend()
# trend
= "Trend", color = "red")
ax2.plot(res.trend, label "Trend")
ax2.set_title("Revenue")
ax2.set_ylabel(True)
ax2.grid(
ax2.legend()
# seasonal
= "Seasonal", color = "green")
ax3.plot(res.seasonal, label "Seasonal")
ax3.set_title("Revenue")
ax3.set_ylabel(True)
ax3.grid(
ax3.legend()
# residual
= "Residual", color = "orange")
ax4.plot(res.resid, label "Residual")
ax4.set_title("Date")
ax4.set_xlabel("Revenue")
ax4.set_ylabel(True)
ax4.grid(
ax4.legend()
# Formatting and final touches
"Date")
plt.xlabel(
plt.tight_layout()
# save the plot
# plt.savefig("chart3", dpi = 300, bbox_inches = "tight")
# calculate outliers based on residuals
= res.resid
residuals = residuals.mean()
residuals_mean = residuals.std()
residuals_std
# calculate the z-score
= (residuals - residuals_mean) / residuals_std
z_score
# identify the outliers
= z_score[abs(z_score) > 3]
outliers
# plot the residuals
=(10, 6))
plt.figure(figsize= "Residuals", color = "orange")
plt.plot(residuals, label # plt.axhline(y = 0, color = "black", linestyle = "--")
= residuals_mean + 3 * residuals_std, color = "red", linestyle = "--")
plt.axhline(y = residuals_mean - 3 * residuals_std, color = "red", linestyle = "--")
plt.axhline(y # plt.scatter(outliers.index, outliers, color = "red", label = "Outliers")
"Residuals with Outliers")
plt.title("Date")
plt.xlabel("Revenue")
plt.ylabel(
plt.legend()True)
plt.grid(
# save the plot
# plt.savefig("chart4", dpi = 300, bbox_inches = "tight")