# import librariesimport pandas as pdfrom statsforecast import StatsForecastfrom statsforecast.models import AutoARIMAimport matplotlib.pyplot as plt
c:\Users\mftok\OneDrive\Github\mftokic.github.io\.venv\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
# read datadata_raw = pd.read_csv("../posts/2024-10-02-ts-fundamentals-whats-a-time-series/example_ts_data.csv")data_raw = (# select columns data_raw[["Country", "Product", "Date", "Revenue"]]# change data types .assign( Date = pd.to_datetime(data_raw["Date"]), Revenue = pd.to_numeric(data_raw["Revenue"]) ))# print the first few rowsprint(data_raw.head())
Country Product Date Revenue
0 United States Ice Cream 2019-01-01 10
1 United States Ice Cream 2019-02-01 12
2 United States Ice Cream 2019-03-01 15
3 United States Ice Cream 2019-04-01 9
4 United States Ice Cream 2019-05-01 11
# filter on specific seriesus_ic_raw = data_raw[(data_raw["Country"] =="United States") & (data_raw["Product"] =="Ice Cream")]# create unique idus_ic_raw["unique_id"] = us_ic_raw["Country"] +"_"+ us_ic_raw["Product"]# convert date to datetimeus_ic_raw["Date"] = pd.to_datetime(us_ic_raw["Date"])# plot the dataplt.figure(figsize=(10, 6))plt.plot(us_ic_raw.index, us_ic_raw["Revenue"], label="Ice Cream Revenue")plt.xlabel("Date")plt.ylabel("Revenue")plt.title("Ice Cream Revenue in United States")plt.legend()
C:\Users\mftok\AppData\Local\Temp\ipykernel_3424\2972639449.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
us_ic_raw["unique_id"] = us_ic_raw["Country"] + "_" + us_ic_raw["Product"]
C:\Users\mftok\AppData\Local\Temp\ipykernel_3424\2972639449.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
us_ic_raw["Date"] = pd.to_datetime(us_ic_raw["Date"])
# get final data for forecastingus_ic_clean = us_ic_raw[["unique_id", "Date", "Revenue"]].copy()# set up models to trainsf = StatsForecast( models=[AutoARIMA(season_length=12)], freq='ME',)# fit the model and forecast for 12 months aheadY_hat_df = sf.forecast(df = us_ic_clean, time_col ="Date", target_col ="Revenue", id_col ="unique_id", h=12, level=[95], fitted=True)print(Y_hat_df.head())# convert date to be first of the monthY_hat_df["Date"] = Y_hat_df["Date"].dt.to_period("M").dt.to_timestamp()
unique_id Date AutoARIMA AutoARIMA-lo-95 \
0 United States_Ice Cream 2023-12-31 15.5 13.248834
1 United States_Ice Cream 2024-01-31 15.5 13.248834
2 United States_Ice Cream 2024-02-29 21.5 19.248835
3 United States_Ice Cream 2024-03-31 14.5 12.248834
4 United States_Ice Cream 2024-04-30 18.5 16.248835
AutoARIMA-hi-95
0 17.751165
1 17.751165
2 23.751165
3 16.751165
4 20.751165
# concat both df togetherfuture_fcst_df = pd.concat([us_ic_clean, Y_hat_df], axis=0)# make date the indexfuture_fcst_df.set_index("Date", inplace=True)print(future_fcst_df.tail())
unique_id Revenue AutoARIMA AutoARIMA-lo-95 \
Date
2024-07-01 United States_Ice Cream NaN 22.5 20.248835
2024-08-01 United States_Ice Cream NaN 25.5 23.248835
2024-09-01 United States_Ice Cream NaN 26.5 24.248835
2024-10-01 United States_Ice Cream NaN 37.5 35.248833
2024-11-01 United States_Ice Cream NaN 45.5 43.248833
AutoARIMA-hi-95
Date
2024-07-01 24.751165
2024-08-01 27.751165
2024-09-01 28.751165
2024-10-01 39.751167
2024-11-01 47.751167
# plot the future fcst dataplt.figure(figsize=(10, 6))# plot the original revenue data as line and forecast as dotted lineplt.plot(future_fcst_df.index, future_fcst_df["Revenue"], label="Actual Revenue")plt.plot(future_fcst_df.index, future_fcst_df["AutoARIMA"], label="ARIMA Forecast", linestyle='dotted')# plot the prediction intervals as shaded areasplt.fill_between(future_fcst_df.index, future_fcst_df["AutoARIMA-lo-95"], future_fcst_df["AutoARIMA-hi-95"], color='gray', alpha=0.2, label='95% Prediction Interval')# chart formattingplt.xlabel("Date")plt.ylabel("Revenue")plt.title("Forecast Results for US Ice Cream Revenue")plt.legend()# save the plot# plt.savefig("chart1", dpi = 300, bbox_inches = "tight")
# get fitted values of the historical data# Note: The fitted values are the predicted values for the training dataresidual_values = sf.forecast_fitted_values()# make date the indexresidual_values.set_index("Date", inplace=True)print(residual_values.head())
unique_id Revenue AutoARIMA AutoARIMA-lo-95 \
Date
2019-01-01 United States_Ice Cream 10.0 9.990125 7.738959
2019-02-01 United States_Ice Cream 12.0 11.988250 9.737084
2019-03-01 United States_Ice Cream 15.0 14.985375 12.734209
2019-04-01 United States_Ice Cream 9.0 8.991500 6.740334
2019-05-01 United States_Ice Cream 11.0 10.989625 8.738459
AutoARIMA-hi-95
Date
2019-01-01 12.241291
2019-02-01 14.239416
2019-03-01 17.236542
2019-04-01 11.242666
2019-05-01 13.240791
# plot the historical fitted valuesplt.figure(figsize=(10, 6))# plot the original revenue data as line and forecast as dotted lineplt.plot(residual_values.index, residual_values["Revenue"], label="Actual Revenue")plt.plot(residual_values.index, residual_values["AutoARIMA"], label="Forecasted Revenue", linestyle='dotted')# plot the prediction intervals as shaded areasplt.fill_between(residual_values.index, residual_values["AutoARIMA-lo-95"], residual_values["AutoARIMA-hi-95"], color='gray', alpha=0.2, label='95% Prediction Interval')# chart formattingplt.xlabel("Date")plt.ylabel("Revenue")plt.title("ARIMA Residuals for US Ice Cream Revenue")plt.legend()# save the plot# plt.savefig("chart2", dpi = 300, bbox_inches = "tight")
# calculate residuals and plot the residuals directly as a line chart residuals = residual_values["Revenue"] - residual_values["AutoARIMA"]plt.figure(figsize=(10, 6))plt.plot(residuals.index, residuals, label="Residuals", color='orange')plt.axhline(y=0, color='red', linestyle='--')plt.xlabel("Date")plt.ylabel("Residuals")plt.title("Residuals Over Time")# save the plot# plt.savefig("chart3", dpi = 300, bbox_inches = "tight")
Text(0.5, 1.0, 'Residuals Over Time')
# create histogram of residualsplt.figure(figsize=(10, 6))plt.hist(residuals, bins=6, color='blue', alpha=0.7)plt.axvline(x=0, color='red', linestyle='--')plt.xlabel("Residuals")plt.ylabel("Frequency")plt.title("Histogram of Residuals")# save the plot# plt.savefig("chart4", dpi = 300, bbox_inches = "tight")
Text(0.5, 1.0, 'Histogram of Residuals')
# create chart that puts the actual values on one side and the fitted values on the other as a scatter plotplt.figure(figsize=(10, 6))# plot the actual valuesplt.scatter(residual_values["Revenue"], residual_values["AutoARIMA"], label="Fitted Values", alpha=0.5)# plot the 45 degree lineplt.plot([residual_values["Revenue"].min(), residual_values["Revenue"].max()], [residual_values["Revenue"].min(), residual_values["Revenue"].max()], color='red', linestyle='--', label="45 Degree Line")# chart formattingplt.xlabel("Actual Revenue")plt.ylabel("Fitted Revenue")plt.title("Q-Q Plot of Actual vs Fitted Values")# save the plot# plt.savefig("chart5", dpi = 300, bbox_inches = "tight")
Text(0.5, 1.0, 'Q-Q Plot of Actual vs Fitted Values')
# create ACF plot on the residualsfrom statsmodels.graphics.tsaplots import plot_acfplt.figure(figsize=(10, 6))plot_acf(residuals, lags=24)plt.title("ACF of Residuals")plt.xlabel("Lags")plt.ylabel("Autocorrelation")plt.legend()# save the plot# plt.savefig("chart6", dpi = 300, bbox_inches = "tight")
C:\Users\mftok\AppData\Local\Temp\ipykernel_3424\3336845809.py:8: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
plt.legend()