196 lines
7.0 KiB
Python
196 lines
7.0 KiB
Python
|
import os
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
from scipy import stats as stats
|
||
|
from scipy.stats import ttest_ind, ttest_rel
|
||
|
from pathlib import Path
|
||
|
|
||
|
from src.data_loading_and_saving.print_and_save_results import print_and_save_result
|
||
|
|
||
|
|
||
|
class TTest:
    """
    A class used to perform a T-test analysis.

    Attributes
    ----------
    print_result: bool
        A flag used to indicate if the function should print the result to the standard output.
    save_result: bool
        A flag used to indicate if the function should save the result to a file.
    filepath: str
        The directory path where the result files will be saved if save_result is True.

    Methods
    -------
    perform_ttest(data, first_group_name, second_group_name, name_save_file) -> None:
        Performs an independent samples T-test for the two specified groups in the provided dataset.
    perform_paired_ttest(data, first_group_name, second_group_name, name_save_file) -> tuple[float, float, float]:
        Performs a paired samples T-test on the two specified groups in the provided dataset.
        Returns the mean difference with the 95% confidence interval.
    """

    def __init__(
        self,
        print_result: bool = True,
        save_result: bool = True,
        filepath: str = "results/",
    ):
        """
        Initializes the TTest object and makes sure the output directory exists.

        Parameters
        ----------
        print_result: bool, optional
            A flag to indicate if the function should print the result to the standard output. Default is True.
        save_result: bool, optional
            A flag to indicate if the function should save the result to a file. Default is True.
        filepath: str, optional
            The directory path where the result files will be saved if save_result is True. Default is "results/".
            The directory is created (including parents) when it does not exist yet,
            regardless of the value of save_result.

        Raises
        ------
        OSError
            If the directory cannot be created, e.g. because ``filepath`` names an existing regular file.
        """
        self.print_result: bool = print_result
        self.save_result: bool = save_result
        self.filepath: str = filepath
        # exist_ok=True instead of an isdir() pre-check: the pre-check can race with
        # another process creating the same directory between the check and makedirs().
        os.makedirs(filepath, exist_ok=True)

    def _report(self, summary: str, name_save_file: Path) -> None:
        """Forward a finished summary to print_and_save_result using this object's settings."""
        print_and_save_result(
            self.print_result,
            self.save_result,
            self.filepath,
            summary,
            name_save_file,
        )

    def perform_ttest(
        self,
        data: pd.DataFrame,
        first_group_name: str,
        second_group_name: str,
        name_save_file: Path,
    ) -> None:
        """
        This method performs a T-test for the means of two independent samples of scores using
        the given columns and either prints or saves the result based on the TTest object properties.

        Missing values are dropped from each column independently, so the two groups may end up
        with different sizes.

        Parameters
        ----------
        data: pandas.DataFrame
            The input dataframe which contains the data.
        first_group_name: str
            The name of the first column to be used in the t-test.
        second_group_name: str
            The name of the second column to be used in the t-test.
        name_save_file: Path
            The name of the file to which the result will be saved (if self.save_result is True).
            It is handed to print_and_save_result together with self.filepath.

        Returns
        -------
        None
        """
        group_1: pd.Series = data[first_group_name].dropna()
        group_2: pd.Series = data[second_group_name].dropna()

        # n1 + n2 - 2 matches ttest_ind as called below, i.e. with its default settings.
        degrees_of_freedom: int = len(group_1) + len(group_2) - 2

        t_statistic: float
        p_value: float
        t_statistic, p_value = ttest_ind(group_1, group_2)

        mean_group_1: float = group_1.mean()
        mean_group_2: float = group_2.mean()

        # pandas' Series.std() is the sample standard deviation (ddof=1).
        standard_deviation_group_1: float = group_1.std()
        standard_deviation_group_2: float = group_2.std()

        ttest_summary: str = (
            f"Mean of {first_group_name}: {mean_group_1}\n"
            f"Mean of {second_group_name}: {mean_group_2}\n"
            f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"
            f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"
            f"Degrees of Freedom: {degrees_of_freedom}\n"
            f"T-statistic: {t_statistic}\n"
            f"P-value: {p_value}"
        )

        self._report(ttest_summary, name_save_file)

    def perform_paired_ttest(
        self,
        data: pd.DataFrame,
        first_group_name: str,
        second_group_name: str,
        name_save_file: Path,
    ) -> tuple[float, float, float]:
        """
        Performs a paired sample t-test and calculates the effect size (Cohen's d) using the given columns

        Rows in which either of the two columns is missing are dropped before the test, so both
        groups always have the same length. Cohen's d is standardised by the root mean square of
        the two group standard deviations, not by the standard deviation of the paired differences.

        Parameters
        ----------
        data : pd.DataFrame
            The DataFrame containing the data of two related groups to be compared
        first_group_name : str
            The name of the first group (column)
        second_group_name : str
            the name of the second group (column)
        name_save_file: Path
            The path of the file to save the result in.

        Returns
        -------
        mean_difference : float
            The mean difference between the two samples
        confidence_interval[0] : float
            The lower bound of the 95% confidence interval
        confidence_interval[1] : float
            The upper bound of the 95% confidence interval
        """
        # Keep only complete pairs; a separate local avoids rebinding the `data` parameter.
        paired_rows: pd.DataFrame = data[[first_group_name, second_group_name]].dropna()
        group_1: pd.Series = paired_rows[first_group_name]
        group_2: pd.Series = paired_rows[second_group_name]
        degrees_of_freedom: int = len(group_1) - 1

        t_statistic: float
        p_value: float
        t_statistic, p_value = ttest_rel(group_1, group_2)

        mean_group_1: float = group_1.mean()
        mean_group_2: float = group_2.mean()
        standard_deviation_group_1: float = group_1.std()
        standard_deviation_group_2: float = group_2.std()
        pooled_standard_deviation: float = np.sqrt(
            (standard_deviation_group_1**2 + standard_deviation_group_2**2) / 2
        )
        cohens_d: float = (mean_group_1 - mean_group_2) / pooled_standard_deviation

        mean_difference: float = mean_group_1 - mean_group_2
        # Standard error of the mean of the paired differences.
        standard_error_difference: float = np.std(group_1 - group_2, ddof=1) / np.sqrt(
            len(group_1)
        )

        # stats.t.interval returns the (lower, upper) end-points of the central 95% interval.
        confidence_interval: tuple[float, float] = stats.t.interval(
            0.95, degrees_of_freedom, loc=mean_difference, scale=standard_error_difference
        )

        paired_ttest_summary: str = (
            f"Mean of {first_group_name}: {mean_group_1}\n"
            f"Mean of {second_group_name}: {mean_group_2}\n"
            f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"
            f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"
            f"Degrees of Freedom: {degrees_of_freedom}\n"
            f"Cohen's d: {cohens_d}\n"
            f"T-statistic: {t_statistic}\n"
            f"P-value: {p_value}"
        )

        self._report(paired_ttest_summary, name_save_file)

        return mean_difference, confidence_interval[0], confidence_interval[1]
|