# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# load the example dataset from disk
csv_path = "../posts/2024-10-02-ts-fundamentals-whats-a-time-series/example_ts_data.csv"
data_raw = pd.read_csv(csv_path)

# keep only the columns used below, then coerce Date and Revenue
# to datetime / numeric types
keep_cols = ["Country", "Product", "Date", "Revenue"]
selected = data_raw[keep_cols]
parsed_dates = pd.to_datetime(data_raw["Date"])
numeric_revenue = pd.to_numeric(data_raw["Revenue"])
data_raw = selected.assign(Date=parsed_dates, Revenue=numeric_revenue)

# preview the first few rows
print(data_raw.head())
# filter on specific series
# build one boolean mask per condition, then keep rows matching both
is_united_states = data_raw["Country"] == "United States"
is_ice_cream = data_raw["Product"] == "Ice Cream"
# index the selected series by its Date column
us_ic_raw = data_raw.loc[is_united_states & is_ice_cream].set_index("Date")
print(us_ic_raw.head())

# draw the revenue series against its date index
plt.figure(figsize=(10, 6))
plt.plot(us_ic_raw.index, us_ic_raw["Revenue"], label="Ice Cream Revenue")
plt.title("Ice Cream Revenue in United States")
plt.xlabel("Date")
plt.ylabel("Revenue")
plt.legend()
# save the plot
# plt.savefig("chart1", dpi = 300, bbox_inches = "tight")
# copy the data
# first order difference: assign() returns a new frame, so us_ic_raw
# itself is left untouched
us_ic_diff = us_ic_raw.assign(Revenue=us_ic_raw["Revenue"].diff())

# chart the differenced revenue
plt.figure(figsize=(10, 6))
plt.plot(
    us_ic_diff.index,
    us_ic_diff["Revenue"],
    label="First Order Difference",
    color="orange",
)
plt.title("Differenced Ice Cream Revenue in United States")
plt.xlabel("Date")
plt.ylabel("Differenced Revenue")
plt.legend()
# save the plot
# plt.savefig("chart2", dpi = 300, bbox_inches = "tight")
# copy the data again for second order differencing
# second order difference: difference the revenue once, then difference
# that result again
us_ic_diff2 = us_ic_raw.copy()
first_order_revenue = us_ic_diff2["Revenue"].diff()
# note: from here on us_ic_diff2 is a Series (revenue only), not a DataFrame
us_ic_diff2 = first_order_revenue.diff()

# chart the twice-differenced revenue
plt.figure(figsize=(10, 6))
plt.plot(
    us_ic_diff2.index,
    us_ic_diff2,
    label="Second Order Difference",
    color="orange",
)
plt.title("Second Order Differenced Ice Cream Revenue in United States")
plt.xlabel("Date")
plt.ylabel("Second Order Difference")
plt.legend()
# save the plot
# plt.savefig("chart3", dpi = 300, bbox_inches = "tight")
# copy the data again for seasonal differencing
# seasonal difference: subtract the value 12 rows earlier
# (12 months for monthly data); assign() leaves us_ic_raw untouched
us_ic_seasonal_diff = us_ic_raw.assign(
    Revenue=us_ic_raw["Revenue"].diff(periods=12)
)

# chart the seasonally differenced revenue
plt.figure(figsize=(10, 6))
plt.plot(
    us_ic_seasonal_diff.index,
    us_ic_seasonal_diff["Revenue"],
    label="Seasonal Difference",
    color="orange",
)
plt.title("Seasonally Differenced Ice Cream Revenue in United States")
plt.xlabel("Date")
plt.ylabel("Seasonal Differenced Revenue")
plt.legend()
# save the plot
# plt.savefig("chart4", dpi = 300, bbox_inches = "tight")
# apply a seasonal difference of 12 months to the original data and then a first order difference
# lag-12 (seasonal) difference first, then a first order difference of
# that result; assign() leaves us_ic_raw untouched
seasonal_revenue = us_ic_raw["Revenue"].diff(periods=12)
us_ic_seasonal_diff_first = us_ic_raw.assign(Revenue=seasonal_revenue.diff())

# chart the seasonal + first order differenced revenue
plt.figure(figsize=(10, 6))
plt.plot(
    us_ic_seasonal_diff_first.index,
    us_ic_seasonal_diff_first["Revenue"],
    label="Seasonal + First Order Difference",
    color="orange",
)
plt.title("Seasonally and First Order Differenced Ice Cream Revenue in United States")
plt.xlabel("Date")
plt.ylabel("Seasonal + First Order Differenced Revenue")
plt.legend()
# save the plot
# plt.savefig("chart5", dpi = 300, bbox_inches = "tight")
# test if original time series is stationary using unit root test
from statsmodels.tsa.stattools import kpss
def kpss_test(timeseries, regression='c', *, alpha=0.05):
    """Run a KPSS test on ``timeseries`` and print a readable summary.

    The null hypothesis of the KPSS test is that the series is stationary,
    so a p-value below ``alpha`` is reported as non-stationary.

    Args:
        timeseries: The observations to test. They are passed straight to
            ``statsmodels.tsa.stattools.kpss``, so they must not contain
            missing values.
        regression: Forwarded to ``kpss`` unchanged. Per the statsmodels
            documentation, ``'c'`` tests stationarity around a constant and
            ``'ct'`` around a trend.
        alpha: Significance level used for the printed verdict. Defaults to
            0.05, the threshold this function previously hard-coded.

    Returns:
        None. All results are written to standard output.

    NOTE: statsmodels documents the KPSS p-value as interpolated from a
    table and bounded to the interval [0.01, 0.1], so an ``alpha`` outside
    that interval cannot produce a meaningful verdict.
    """
    statistic, p_value, lags, critical_values = kpss(timeseries, regression=regression)

    print(f'KPSS Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print(f'Num Lags Used: {lags}')
    print('Critical Values:')
    for key, value in critical_values.items():
        print(f' {key}: {value}')

    # Rejecting the null hypothesis means evidence against stationarity.
    if p_value < alpha:
        print("\nResult: Series is non-stationary (reject the null hypothesis)")
    else:
        print("\nResult: Series is stationary (fail to reject the null hypothesis)")
# KPSS on the original (undifferenced) revenue series
raw_revenue = us_ic_raw["Revenue"]
kpss_test(raw_revenue, regression="c")

# KPSS on the first order differenced series, dropping the first row (NaN value)
us_ic_diff = us_ic_diff.dropna()
first_diff_revenue = us_ic_diff["Revenue"]
kpss_test(first_diff_revenue, regression="c")

# KPSS on the second order differenced series, dropping the first two rows
# (NaN values); us_ic_diff2 is already a Series, so it is passed directly
us_ic_diff2 = us_ic_diff2.dropna()
kpss_test(us_ic_diff2, regression="c")

# KPSS on the seasonal differenced series, dropping the first 12 rows (NaN values)
us_ic_seasonal_diff = us_ic_seasonal_diff.dropna()
seasonal_revenue_clean = us_ic_seasonal_diff["Revenue"]
kpss_test(seasonal_revenue_clean, regression="c")

# KPSS on the seasonal and first order differenced series, dropping the
# first 13 rows (12 for seasonal and 1 for first order)
us_ic_seasonal_diff_first = us_ic_seasonal_diff_first.dropna()
seasonal_first_revenue = us_ic_seasonal_diff_first["Revenue"]
kpss_test(seasonal_first_revenue, regression="c")