# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.feature_selection import mutual_info_regression
# read data
data_raw = pd.read_csv("../posts/2024-10-02-ts-fundamentals-whats-a-time-series/example_ts_data.csv")

data_raw = (
    data_raw
    # select columns
    [["Country", "Product", "Date", "Revenue"]]
    # change data types: Date -> datetime, Revenue -> numeric
    .assign(
        Date=pd.to_datetime(data_raw["Date"]),
        Revenue=pd.to_numeric(data_raw["Revenue"]),
    )
)

# print the first few rows
print(data_raw.head())
# filter on a specific series: US Cookies revenue.
# .copy() makes an explicit copy so the in-place set_index below does not
# mutate a view of data_raw (avoids pandas' SettingWithCopyWarning).
us_ck_raw = data_raw[(data_raw["Country"] == "United States") & (data_raw["Product"] == "Cookies")].copy()
us_ck_raw.set_index("Date", inplace=True)
print(us_ck_raw.head())

# plot the data
plt.figure(figsize=(10, 6))
plt.plot(us_ck_raw.index, us_ck_raw["Revenue"], label="Cookies Revenue")
plt.xlabel("Date")
plt.ylabel("Revenue")
plt.title("Cookies Revenue in the United States")
plt.legend()
# Set random seeds for reproducibility.
# NOTE: random.seed only seeds the stdlib `random` module; the noise below is
# drawn with np.random, so NumPy must be seeded separately or the run is not
# actually reproducible.
random.seed(15)
np.random.seed(15)

us_ck_corr = us_ck_raw.copy()

# Generate a new variable with a strong target correlation to Revenue
correlation_target = 0.9
n = us_ck_corr.shape[0]
noise = np.random.randn(n)

# Orthogonalize the noise to Revenue (subtract its projection) so the noise
# component is uncorrelated with the signal component
revenue = us_ck_corr["Revenue"]
noise = noise - np.dot(noise, revenue) / np.dot(revenue, revenue) * revenue

# Scale the orthogonalized noise so that mixing it with Revenue below
# produces (approximately) the desired correlation
noise = noise / np.linalg.norm(noise) * np.sqrt(1 - correlation_target**2) * np.linalg.norm(revenue)

# Create the new variable as a weighted mix of signal and orthogonal noise
us_ck_corr["xreg1"] = correlation_target * revenue + noise

# Verify the correlation
correlation = us_ck_corr["Revenue"].corr(us_ck_corr["xreg1"])
print(f"Correlation between Original and New_Var: {correlation:.4f}")
# create a variable with a weak correlation to Revenue: distinct random
# integers drawn from 1..99 (random.sample draws without replacement, so
# this requires the series to have fewer than 99 rows)
us_ck_corr["xreg2"] = random.sample(range(1, 100), us_ck_corr.shape[0])

# print the first few rows, formatting numbers to 2 decimal places
print(us_ck_corr.head(10).round(2))
# plot the data with the new variables as dotted lines
plt.figure(figsize=(10, 6))
plt.plot(us_ck_corr.index, us_ck_corr["Revenue"], label="Cookies Revenue")
plt.plot(us_ck_corr.index, us_ck_corr["xreg1"], label="xreg1", linestyle="dotted")
plt.plot(us_ck_corr.index, us_ck_corr["xreg2"], label="xreg2", linestyle="dotted")
plt.xlabel("Date")
plt.ylabel("Revenue")
plt.title("Cookies Revenue in the United States")
plt.legend()

# save the plot
# plt.savefig("chart1", dpi=300, bbox_inches="tight")

# correlation between the target variable and the new variables.
# Country and Product are non-numeric so they are dropped first
# (Date is already the index and is excluded automatically).
correlation = us_ck_corr.drop(columns=["Country", "Product"]).corr()

# reshape the square correlation matrix into a long table and keep only
# the "Revenue vs other variable" rows
correlation_table = correlation.stack().reset_index()
correlation_table.columns = ["Variable 1", "Variable 2", "Correlation"]
correlation_table = correlation_table[correlation_table["Variable 1"] == "Revenue"]
correlation_table = correlation_table[correlation_table["Variable 2"] != "Revenue"]

# print the correlation table, rounding the values to 2 decimal places
print(correlation_table.round(2))
# calculate lags of the xreg1 and xreg2 columns: 1, 2, 3, 6, 9, 12 lags.
# A loop replaces twelve near-identical shift() statements.
for col in ["xreg1", "xreg2"]:
    for lag in [1, 2, 3, 6, 9, 12]:
        us_ck_corr[f"{col}_lag_{lag}"] = us_ck_corr[col].shift(lag)

# correlation between the target variable and all (lagged) variables,
# dropping the non-numeric Country and Product columns first
relation = us_ck_corr.drop(columns=["Country", "Product"]).corr()

# reshape the correlation matrix into a long table and keep only
# the "Revenue vs other variable" rows
lag_table = relation.stack().reset_index()
lag_table.columns = ["Variable 1", "Variable 2", "Correlation"]
lag_table = lag_table[lag_table["Variable 1"] == "Revenue"]
lag_table = lag_table[lag_table["Variable 2"] != "Revenue"]

# print the correlation table, rounding the values to 2 decimal places
print(lag_table.round(2))
# create mutual information function
def calculate_mutual_information(x, y, n_neighbors=3, discrete_features=False, random_state=123):
    """
    Calculate the Mutual Information (MI) score between two variables
    using sklearn's mutual_info_regression.

    Parameters:
        x (array-like): First variable (time series or feature).
        y (array-like): Second variable (time series or feature).
        n_neighbors (int): Number of neighbors for density estimation (default 3).
        discrete_features (bool): Whether the features are discrete (default False).
        random_state (int): Seed for the small noise sklearn adds internally,
            making the estimate deterministic (default 123).

    Returns:
        float: Mutual Information (MI) score.
    """
    # mutual_info_regression expects a 2D feature matrix, so reshape x
    x = np.array(x).reshape(-1, 1)
    y = np.array(y)

    # Calculate Mutual Information
    mi_score = mutual_info_regression(
        x,
        y,
        n_neighbors=n_neighbors,
        discrete_features=discrete_features,
        random_state=random_state,
    )

    # x has exactly one feature, so the result array has a single entry
    return mi_score[0]
# copy the data
us_ck_mi = us_ck_corr.copy()

# drop rows made incomplete (NaN) by the lagged columns
us_ck_mi.dropna(inplace=True)

# Define variables and their lags (0 means the unlagged column itself)
variables = ["xreg1", "xreg2"]
lags = [0, 1, 2, 3, 6, 9, 12]

# Initialize results
mi_results = []

# Calculate MI dynamically for variables and their lags
for var in variables:
    for lag in lags:
        col_name = f"{var}_lag_{lag}" if lag != 0 else var
        if col_name in us_ck_mi.columns:
            mi_score = calculate_mutual_information(us_ck_mi[col_name], us_ck_mi["Revenue"])
            mi_results.append({"Variable 1": "Revenue", "Variable 2": col_name, "Mutual Information": mi_score})

# Create a DataFrame from the collected results
mi_table = pd.DataFrame(mi_results)

# Print the table rounded to 2 decimal places
print(mi_table.round(2))
# create a binary variable that is 1 in the year 2020 and 0 otherwise
# (relies on the index being the Date DatetimeIndex set earlier)
us_ck_binary = us_ck_corr.copy()
us_ck_binary["COVID_Flag"] = (us_ck_binary.index.year == 2020).astype(int)

print(us_ck_binary[["Country", "Product", "Revenue", "COVID_Flag"]].head())

# calculate the mutual information between the binary variable and the revenue column
mi_score = calculate_mutual_information(us_ck_binary["COVID_Flag"], us_ck_binary["Revenue"])
print(f"Mutual Information between COVID Flag and Revenue: {mi_score:.4f}")
# create a random variable that is 1 with probability 0.5 and 0 otherwise
us_ck_random = us_ck_corr.copy()
us_ck_random["Random_Flag"] = np.random.choice([0, 1], us_ck_random.shape[0], p=[0.5, 0.5])

print(us_ck_random[["Country", "Product", "Revenue", "Random_Flag"]].head())

# calculate the mutual information between the random variable and the revenue column
mi_score = calculate_mutual_information(us_ck_random["Random_Flag"], us_ck_random["Revenue"])
print(f"Mutual Information between Random Flag and Revenue: {mi_score:.4f}")