196 lines
7.0 KiB
Python
Raw Normal View History

import os
import pandas as pd
import numpy as np
from scipy import stats as stats
from scipy.stats import ttest_ind, ttest_rel
from pathlib import Path
from src.data_loading_and_saving.print_and_save_results import print_and_save_result
class TTest:
    """
    Run independent and paired samples T-tests on two columns of a DataFrame.

    Each test builds a plain-text summary and hands it to
    ``print_and_save_result`` together with the print/save flags and the
    output directory stored on the instance.

    Attributes
    ----------
    print_result: bool
        Flag forwarded to ``print_and_save_result`` to request printing.
    save_result: bool
        Flag forwarded to ``print_and_save_result`` to request saving.
    filepath: str
        Directory forwarded to ``print_and_save_result`` as the output
        location. It is created on construction if it does not exist.

    Methods
    -------
    perform_ttest(data, first_group_name, second_group_name, name_save_file) -> None:
        Independent samples T-test between two columns.
    perform_paired_ttest(data, first_group_name, second_group_name, name_save_file)
            -> tuple[float, float, float]:
        Paired samples T-test between two columns; returns the mean
        difference and the bounds of its confidence interval.
    """

    def __init__(
        self,
        print_result: bool = True,
        save_result: bool = True,
        filepath: str = "results/",
    ):
        """
        Store the reporting options and make sure the output directory exists.

        Parameters
        ----------
        print_result: bool, optional
            Whether results should be printed. Default is True.
        save_result: bool, optional
            Whether results should be saved to a file. Default is True.
        filepath: str, optional
            Directory in which result files are stored. Default is "results/".
        """
        self.print_result: bool = print_result
        self.save_result: bool = save_result
        self.filepath: str = filepath
        # exist_ok avoids the check-then-create race of a separate isdir() test.
        os.makedirs(filepath, exist_ok=True)

    def _report(self, summary: str, name_save_file: Path) -> None:
        """Forward a finished summary to ``print_and_save_result``."""
        print_and_save_result(
            self.print_result,
            self.save_result,
            self.filepath,
            summary,
            name_save_file,
        )

    def perform_ttest(
        self,
        data: pd.DataFrame,
        first_group_name: str,
        second_group_name: str,
        name_save_file: Path,
    ) -> None:
        """
        Run an independent samples T-test between two columns and report it.

        Missing values are dropped from each column separately, so the two
        groups may end up with different sizes. ``scipy.stats.ttest_ind`` is
        called with its defaults (equal variances assumed), which matches the
        reported degrees of freedom ``n1 + n2 - 2``.

        Parameters
        ----------
        data: pandas.DataFrame
            The input dataframe which contains both columns.
        first_group_name: str
            Name of the column holding the first group.
        second_group_name: str
            Name of the column holding the second group.
        name_save_file: Path
            File name passed on to ``print_and_save_result``.

        Returns
        -------
        None
        """
        group_1: pd.Series = data[first_group_name].dropna()
        group_2: pd.Series = data[second_group_name].dropna()
        degrees_of_freedom: int = len(group_1) + len(group_2) - 2

        t_statistic: float
        p_value: float
        t_statistic, p_value = ttest_ind(group_1, group_2)

        mean_group_1: float = group_1.mean()
        mean_group_2: float = group_2.mean()
        # pandas' std() is the sample standard deviation (ddof=1).
        standard_deviation_group_1: float = group_1.std()
        standard_deviation_group_2: float = group_2.std()

        ttest_summary: str = (
            f"Mean of {first_group_name}: {mean_group_1}\n"
            f"Mean of {second_group_name}: {mean_group_2}\n"
            f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"
            f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"
            f"Degrees of Freedom: {degrees_of_freedom}\n"
            f"T-statistic: {t_statistic}\n"
            f"P-value: {p_value}"
        )
        self._report(ttest_summary, name_save_file)

    def perform_paired_ttest(
        self,
        data: pd.DataFrame,
        first_group_name: str,
        second_group_name: str,
        name_save_file: Path,
        *,
        confidence_level: float = 0.95,
    ) -> tuple[float, float, float]:
        """
        Run a paired samples T-test between two columns and report it.

        Rows with a missing value in either column are dropped so that the
        remaining observations stay paired. The reported summary contains the
        group means and standard deviations, the degrees of freedom, Cohen's d
        (mean difference divided by the square root of the average of the two
        group variances), the T-statistic and the P-value.

        Parameters
        ----------
        data : pd.DataFrame
            The DataFrame containing the two related columns to compare.
        first_group_name : str
            Name of the column holding the first group.
        second_group_name : str
            Name of the column holding the second group.
        name_save_file: Path
            File name passed on to ``print_and_save_result``.
        confidence_level : float, optional
            Confidence level of the interval around the mean difference.
            Must lie strictly between 0 and 1. Default is 0.95.

        Returns
        -------
        mean_difference : float
            Mean of the first group minus mean of the second group.
        lower_bound : float
            Lower bound of the confidence interval of the mean difference.
        upper_bound : float
            Upper bound of the confidence interval of the mean difference.

        Raises
        ------
        ValueError
            If ``confidence_level`` is not strictly between 0 and 1.
        """
        if not 0.0 < confidence_level < 1.0:
            raise ValueError(
                f"confidence_level must be strictly between 0 and 1, got {confidence_level}"
            )

        # Drop incomplete rows jointly; dropping per column would break the pairing.
        paired: pd.DataFrame = data[[first_group_name, second_group_name]].dropna()
        group_1: pd.Series = paired[first_group_name]
        group_2: pd.Series = paired[second_group_name]
        sample_size: int = len(group_1)
        degrees_of_freedom: int = sample_size - 1

        t_statistic: float
        p_value: float
        t_statistic, p_value = ttest_rel(group_1, group_2)

        mean_group_1: float = group_1.mean()
        mean_group_2: float = group_2.mean()
        standard_deviation_group_1: float = group_1.std()
        standard_deviation_group_2: float = group_2.std()

        # Effect size standardised by the root of the averaged group variances.
        pooled_standard_deviation: float = np.sqrt(
            (standard_deviation_group_1**2 + standard_deviation_group_2**2) / 2
        )
        mean_difference: float = mean_group_1 - mean_group_2
        cohens_d: float = mean_difference / pooled_standard_deviation

        # Standard error of the mean of the pairwise differences.
        standard_error_difference: float = np.std(group_1 - group_2, ddof=1) / np.sqrt(
            sample_size
        )
        confidence_interval: tuple[float, float] = stats.t.interval(
            confidence_level,
            degrees_of_freedom,
            loc=mean_difference,
            scale=standard_error_difference,
        )

        paired_ttest_summary: str = (
            f"Mean of {first_group_name}: {mean_group_1}\n"
            f"Mean of {second_group_name}: {mean_group_2}\n"
            f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"
            f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"
            f"Degrees of Freedom: {degrees_of_freedom}\n"
            f"Cohen's d: {cohens_d}\n"
            f"T-statistic: {t_statistic}\n"
            f"P-value: {p_value}"
        )
        self._report(paired_ttest_summary, name_save_file)
        return mean_difference, confidence_interval[0], confidence_interval[1]