# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# read data
data_raw = pd.read_csv(
    "../posts/2024-10-02-ts-fundamentals-whats-a-time-series/example_ts_data.csv"
)

data_raw = (
    # select columns
    data_raw[["Country", "Product", "Date", "Revenue"]]
    # change data types; callables make the conversion operate on the frame
    # produced by the previous step of the chain
    .assign(
        Date=lambda df: pd.to_datetime(df["Date"]),
        Revenue=lambda df: pd.to_numeric(df["Revenue"]),
    )
)

# print the first few rows
print(data_raw.head())
# filter on specific series
is_us_ice_cream = (data_raw["Country"] == "United States") & (
    data_raw["Product"] == "Ice Cream"
)

# index the selected series by date; chaining returns a new frame instead of
# mutating the filtered result in place
us_ic_raw = data_raw.loc[is_us_ice_cream].set_index("Date")

print(us_ic_raw.head())
# plot the data
plt.figure(figsize=(10, 6))
plt.plot(us_ic_raw.index, us_ic_raw["Revenue"], label="Ice Cream Revenue")
plt.xlabel("Date")
plt.ylabel("Revenue")
plt.title("Ice Cream Revenue in United States")
plt.legend()

# save the plot
# plt.savefig("chart1", dpi = 300, bbox_inches = "tight")
# copy the data so the original series is left untouched
us_ic_diff = us_ic_raw.copy()

# difference the revenue data (first row becomes NaN)
us_ic_diff["Revenue"] = us_ic_diff["Revenue"].diff()

# plot the differenced data
plt.figure(figsize=(10, 6))
plt.plot(
    us_ic_diff.index,
    us_ic_diff["Revenue"],
    label="First Order Difference",
    color='orange',
)
plt.xlabel("Date")
plt.ylabel("Differenced Revenue")
plt.title("Differenced Ice Cream Revenue in United States")
plt.legend()

# save the plot
# plt.savefig("chart2", dpi = 300, bbox_inches = "tight")
# take a second order difference of the revenue (first two rows become NaN);
# .diff() returns a new Series, so the original data is not modified and
# us_ic_diff2 is a Series rather than a DataFrame
us_ic_diff2 = us_ic_raw["Revenue"].diff().diff()

# plot the differenced data
plt.figure(figsize=(10, 6))
plt.plot(
    us_ic_diff2.index,
    us_ic_diff2,
    label="Second Order Difference",
    color='orange',
)
plt.xlabel("Date")
plt.ylabel("Second Order Difference")
plt.title("Second Order Differenced Ice Cream Revenue in United States")
plt.legend()

# save the plot
# plt.savefig("chart3", dpi = 300, bbox_inches = "tight")
# copy the data again for seasonal differencing
us_ic_seasonal_diff = us_ic_raw.copy()

# take a seasonal difference (12 months for monthly data)
us_ic_seasonal_diff["Revenue"] = us_ic_seasonal_diff["Revenue"].diff(12)

# plot the seasonal differenced data
plt.figure(figsize=(10, 6))
plt.plot(
    us_ic_seasonal_diff.index,
    us_ic_seasonal_diff["Revenue"],
    label="Seasonal Difference",
    color='orange',
)
plt.xlabel("Date")
plt.ylabel("Seasonal Differenced Revenue")
plt.title("Seasonally Differenced Ice Cream Revenue in United States")
plt.legend()

# save the plot
# plt.savefig("chart4", dpi = 300, bbox_inches = "tight")
# apply a seasonal difference of 12 months to the original data and then a first order difference
us_ic_seasonal_diff_first = us_ic_raw.copy()
us_ic_seasonal_diff_first["Revenue"] = (
    us_ic_seasonal_diff_first["Revenue"].diff(12).diff()
)

# plot the seasonal differenced and first order differenced data
plt.figure(figsize=(10, 6))
plt.plot(
    us_ic_seasonal_diff_first.index,
    us_ic_seasonal_diff_first["Revenue"],
    label="Seasonal + First Order Difference",
    color='orange',
)
plt.xlabel("Date")
plt.ylabel("Seasonal + First Order Differenced Revenue")
plt.title("Seasonally and First Order Differenced Ice Cream Revenue in United States")
plt.legend()

# save the plot
# plt.savefig("chart5", dpi = 300, bbox_inches = "tight")
# test if original time series is stationary using unit root test
from statsmodels.tsa.stattools import kpss
def kpss_test(timeseries, regression='c'):
    """Run a KPSS test on *timeseries* and print a short report.

    Prints the test statistic, p-value, number of lags used and the critical
    values, followed by a verdict at the 5% significance level. The verdict
    treats stationarity as the null hypothesis: a p-value below 0.05 is
    reported as non-stationary, anything else as stationary.

    Args:
        timeseries: The observations to test; forwarded unchanged to ``kpss``.
            Callers in this file pass a Series with NaN rows already dropped.
        regression: Forwarded unchanged to ``kpss`` as its ``regression``
            argument. Defaults to ``'c'``.

    Returns:
        None. Results are written to stdout only.
    """
    statistic, p_value, lags, critical_values = kpss(
        timeseries, regression=regression
    )

    print(f'KPSS Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print(f'Num Lags Used: {lags}')
    print('Critical Values:')
    for key, value in critical_values.items():
        print(f' {key}: {value}')

    # Rejecting the null hypothesis means evidence against stationarity.
    if p_value < 0.05:
        print("\nResult: Series is non-stationary (reject the null hypothesis)")
    else:
        print("\nResult: Series is stationary (fail to reject the null hypothesis)")
# apply the KPSS test to the original time series
kpss_test(us_ic_raw["Revenue"], regression='c')

# apply the KPSS test to the first order differenced time series, dropping the first row (NaN value)
us_ic_diff = us_ic_diff.dropna()
kpss_test(us_ic_diff["Revenue"], regression='c')

# apply the KPSS test to the second order differenced time series, dropping the first two rows (NaN values)
# (us_ic_diff2 is already a Series of revenue differences, so it is passed directly)
us_ic_diff2 = us_ic_diff2.dropna()
kpss_test(us_ic_diff2, regression='c')

# apply the KPSS test to the seasonal differenced time series, dropping the first 12 rows (NaN values)
us_ic_seasonal_diff = us_ic_seasonal_diff.dropna()
kpss_test(us_ic_seasonal_diff["Revenue"], regression='c')

# apply the KPSS test to the seasonal and first order differenced time series, dropping the first 13 rows (12 for seasonal and 1 for first order)
us_ic_seasonal_diff_first = us_ic_seasonal_diff_first.dropna()
kpss_test(us_ic_seasonal_diff_first["Revenue"], regression='c')