# Python Code

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# load the example dataset from disk
data_raw = pd.read_csv("../posts/2024-10-02-ts-fundamentals-whats-a-time-series/example_ts_data.csv")

# keep only the columns used below, then parse the dates and coerce revenue
# to a numeric dtype; each lambda receives the column-subset frame
data_raw = data_raw[["Country", "Product", "Date", "Revenue"]].assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    Revenue=lambda frame: pd.to_numeric(frame["Revenue"]),
)

# preview the cleaned data
print(data_raw.head())

# filter on a specific series, index it by date, and put it in chronological
# order: every diff() later in this script compares a row with the rows that
# precede it, so the results are only meaningful when rows are sorted by date.
# Building the frame with a method chain (rather than set_index(inplace=True))
# also avoids mutating a frame that was derived from data_raw by boolean indexing.
us_ic_raw = (
    data_raw[(data_raw["Country"] == "United States") & (data_raw["Product"] == "Ice Cream")]
    .set_index("Date")
    # mergesort is stable, so rows that share a date keep their original order
    .sort_index(kind="mergesort")
)

print(us_ic_raw.head())

# draw the raw revenue series on its own 10x6 figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(us_ic_raw.index, us_ic_raw["Revenue"], label="Ice Cream Revenue")
ax.set(xlabel="Date", ylabel="Revenue", title="Ice Cream Revenue in United States")
ax.legend()

# uncomment to save the plot to disk
# fig.savefig("chart1", dpi=300, bbox_inches="tight")

# first-order difference: the change in revenue from one row to the next.
# assign() returns a new frame, so us_ic_raw itself is left untouched.
us_ic_diff = us_ic_raw.assign(Revenue=us_ic_raw["Revenue"].diff())

# draw the differenced series on its own figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(us_ic_diff.index, us_ic_diff["Revenue"], label="First Order Difference", color="orange")
ax.set(
    xlabel="Date",
    ylabel="Differenced Revenue",
    title="Differenced Ice Cream Revenue in United States",
)
ax.legend()

# uncomment to save the plot to disk
# fig.savefig("chart2", dpi=300, bbox_inches="tight")

# second-order difference: the difference of the first-order differences.
# diff() returns a new object, so us_ic_raw does not need to be copied first.
# NOTE: unlike the other differenced objects in this script, us_ic_diff2 is a
# Series holding only the Revenue values, not a DataFrame.
us_ic_diff2 = us_ic_raw["Revenue"].diff().diff()

# plot the differenced data
plt.figure(figsize=(10, 6))
plt.plot(us_ic_diff2.index, us_ic_diff2, label="Second Order Difference", color='orange')
plt.xlabel("Date")
plt.ylabel("Second Order Difference")
plt.title("Second Order Differenced Ice Cream Revenue in United States")
plt.legend()

# save the plot
# plt.savefig("chart3", dpi = 300, bbox_inches = "tight")

# seasonal difference: each row minus the row 12 positions earlier
# (the same month one year before, given monthly data).
# assign() returns a new frame, so us_ic_raw itself is left untouched.
us_ic_seasonal_diff = us_ic_raw.assign(Revenue=us_ic_raw["Revenue"].diff(periods=12))

# draw the seasonally differenced series on its own figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    us_ic_seasonal_diff.index,
    us_ic_seasonal_diff["Revenue"],
    label="Seasonal Difference",
    color="orange",
)
ax.set(
    xlabel="Date",
    ylabel="Seasonal Differenced Revenue",
    title="Seasonally Differenced Ice Cream Revenue in United States",
)
ax.legend()

# uncomment to save the plot to disk
# fig.savefig("chart4", dpi=300, bbox_inches="tight")

# seasonal difference (lag 12) of the original data, followed by a
# first-order difference of that result; assign() leaves us_ic_raw untouched
us_ic_seasonal_diff_first = us_ic_raw.assign(
    Revenue=us_ic_raw["Revenue"].diff(periods=12).diff()
)

# draw the doubly differenced series on its own figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    us_ic_seasonal_diff_first.index,
    us_ic_seasonal_diff_first["Revenue"],
    label="Seasonal + First Order Difference",
    color="orange",
)
ax.set(
    xlabel="Date",
    ylabel="Seasonal + First Order Differenced Revenue",
    title="Seasonally and First Order Differenced Ice Cream Revenue in United States",
)
ax.legend()

# uncomment to save the plot to disk
# fig.savefig("chart5", dpi=300, bbox_inches="tight")

# test whether each time series is stationary using the KPSS unit root test
from statsmodels.tsa.stattools import kpss

def kpss_test(timeseries, regression='c', *, alpha=0.05):
    """Run a KPSS test on a series and print a human-readable report.

    The null hypothesis of the KPSS test is that the series is stationary,
    so a p-value below ``alpha`` is reported as evidence of non-stationarity.

    Args:
        timeseries: The observations to test; passed unchanged to
            ``statsmodels.tsa.stattools.kpss``.
        regression: Forwarded to ``kpss``. ``'c'`` tests stationarity around
            a constant, ``'ct'`` around a deterministic trend.
        alpha: Significance level used for the printed verdict. Keyword-only;
            defaults to 0.05. statsmodels interpolates the p-value from a
            table of critical values, so the reported p-value is bounded to
            the range [0.01, 0.1]; an ``alpha`` outside that range cannot
            change the verdict in a meaningful way.

    Returns:
        None. The statistic, p-value, lag count, critical values and the
        verdict are printed to standard output.

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1.
    """
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha!r}")

    statistic, p_value, lags, critical_values = kpss(timeseries, regression=regression)

    print(f'KPSS Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print(f'Num Lags Used: {lags}')
    print('Critical Values:')
    for key, value in critical_values.items():
        print(f'   {key}: {value}')

    # Small p-value => reject the null hypothesis of stationarity.
    if p_value < alpha:
        print("\nResult: Series is non-stationary (reject the null hypothesis)")
    else:
        print("\nResult: Series is stationary (fail to reject the null hypothesis)")

# drop the NaN rows that differencing leaves at the start of each series
# (1 for first order, 2 for second order, 12 for seasonal,
# 13 for seasonal + first order)
us_ic_diff = us_ic_diff.dropna()
us_ic_diff2 = us_ic_diff2.dropna()
us_ic_seasonal_diff = us_ic_seasonal_diff.dropna()
us_ic_seasonal_diff_first = us_ic_seasonal_diff_first.dropna()

# run the KPSS test on each series in turn: original, first order,
# second order, seasonal, then seasonal + first order
for revenue_series in (
    us_ic_raw["Revenue"],
    us_ic_diff["Revenue"],
    us_ic_diff2,  # already a Series, so no column lookup
    us_ic_seasonal_diff["Revenue"],
    us_ic_seasonal_diff_first["Revenue"],
):
    kpss_test(revenue_series, regression="c")