import os
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats as stats
from scipy.stats import ttest_ind, ttest_rel

from src.data_loading_and_saving.print_and_save_results import print_and_save_result


class TTest:
    """
    Run independent and paired t-tests on columns of a DataFrame and report them.

    Each test builds a plain-text summary (means, standard deviations, degrees
    of freedom, test statistic, p-value) and forwards it to
    ``print_and_save_result`` together with the print/save settings stored on
    the instance.

    Attributes
    ----------
    print_result: bool
        Flag forwarded to ``print_and_save_result``; indicates whether the
        summary should be printed to the standard output.
    save_result: bool
        Flag forwarded to ``print_and_save_result``; indicates whether the
        summary should be saved to a file.
    filepath: str
        The directory path where the result files will be saved if
        save_result is True.

    Methods
    -------
    perform_ttest(data, first_group_name, second_group_name, name_save_file) -> None:
        Performs an independent samples T-test for the two specified columns.
    perform_paired_ttest(data, first_group_name, second_group_name, name_save_file) -> tuple[float, float, float]:
        Performs a paired samples T-test on the two specified columns and
        returns the mean difference with its 95% confidence interval.
    """

    def __init__(
        self,
        print_result: bool = True,
        save_result: bool = True,
        filepath: str = "results/",
    ):
        """
        Store the reporting options and make sure the output directory exists.

        Parameters
        ----------
        print_result: bool, optional
            Whether results should be printed to the standard output.
            Default is True.
        save_result: bool, optional
            Whether results should be saved to a file. Default is True.
        filepath: str, optional
            The directory path where the result files will be saved if
            save_result is True. Default is "results/".

        Notes
        -----
        The directory is created on construction regardless of the value of
        ``save_result``.
        """
        self.print_result: bool = print_result
        self.save_result: bool = save_result
        self.filepath: str = filepath

        # exist_ok=True replaces the former "isdir, then makedirs" sequence,
        # which could fail if the directory was created by someone else
        # between the check and the call.
        os.makedirs(filepath, exist_ok=True)

    def _report(self, summary: str, name_save_file: Path) -> None:
        """Forward a finished summary to ``print_and_save_result`` using this object's settings."""
        print_and_save_result(
            self.print_result,
            self.save_result,
            self.filepath,
            summary,
            name_save_file,
        )

    def perform_ttest(
        self,
        data: pd.DataFrame,
        first_group_name: str,
        second_group_name: str,
        name_save_file: Path,
    ) -> None:
        """
        Perform a T-test for the means of two independent samples.

        Missing values are dropped from each column separately, so the two
        groups may end up with different sizes. The summary is handed to
        ``print_and_save_result`` according to the TTest object properties.

        Parameters
        ----------
        data: pandas.DataFrame
            The input dataframe which contains the data.
        first_group_name: str
            The name of the first column to be used in the t-test.
        second_group_name: str
            The name of the second column to be used in the t-test.
        name_save_file: Path
            The name of the file to which the result will be saved
            (if self.save_result is True).

        Returns
        -------
        None
        """
        # Selecting a single column yields a Series, not a DataFrame.
        group_1: pd.Series = data[first_group_name].dropna()
        group_2: pd.Series = data[second_group_name].dropna()

        # n1 + n2 - 2 matches the equal-variance test that ttest_ind runs
        # with its default arguments.
        degrees_of_freedom: int = len(group_1) + len(group_2) - 2

        t_statistic: float
        p_value: float
        t_statistic, p_value = ttest_ind(group_1, group_2)

        mean_group_1: float = group_1.mean()
        mean_group_2: float = group_2.mean()
        standard_deviation_group_1: float = group_1.std()
        standard_deviation_group_2: float = group_2.std()

        ttest_summary: str = (
            f"Mean of {first_group_name}: {mean_group_1}\n"
            f"Mean of {second_group_name}: {mean_group_2}\n"
            f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"
            f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"
            f"Degrees of Freedom: {degrees_of_freedom}\n"
            f"T-statistic: {t_statistic}\n"
            f"P-value: {p_value}"
        )

        self._report(ttest_summary, name_save_file)

    def perform_paired_ttest(
        self,
        data: pd.DataFrame,
        first_group_name: str,
        second_group_name: str,
        name_save_file: Path,
    ) -> tuple[float, float, float]:
        """
        Perform a paired sample t-test and calculate the effect size (Cohen's d).

        Rows with a missing value in either column are dropped first, so only
        complete pairs are analysed. The summary (means, standard deviations,
        degrees of freedom, Cohen's d, t-statistic, p-value) is handed to
        ``print_and_save_result``; the mean difference and its confidence
        interval are returned to the caller and are not part of the summary.

        Parameters
        ----------
        data : pd.DataFrame
            The DataFrame containing the data of two related groups to be compared
        first_group_name : str
            The name of the first group (column)
        second_group_name : str
            The name of the second group (column)
        name_save_file: Path
            The path of the file to save the result in.

        Returns
        -------
        mean_difference : float
            The mean difference between the two samples (first minus second)
        confidence_interval[0] : float
            The lower bound of the 95% confidence interval
        confidence_interval[1] : float
            The upper bound of the 95% confidence interval
        """
        # Use a new name instead of rebinding the ``data`` parameter.
        paired_data: pd.DataFrame = data[[first_group_name, second_group_name]].dropna()
        group_1: pd.Series = paired_data[first_group_name]
        group_2: pd.Series = paired_data[second_group_name]

        number_of_pairs: int = len(group_1)
        degrees_of_freedom: int = number_of_pairs - 1

        t_statistic: float
        p_value: float
        t_statistic, p_value = ttest_rel(group_1, group_2)

        mean_group_1: float = group_1.mean()
        mean_group_2: float = group_2.mean()
        standard_deviation_group_1: float = group_1.std()
        standard_deviation_group_2: float = group_2.std()

        mean_difference: float = mean_group_1 - mean_group_2

        # Effect size standardised by the root mean of the two group variances.
        pooled_standard_deviation: float = np.sqrt(
            (standard_deviation_group_1**2 + standard_deviation_group_2**2) / 2
        )
        cohens_d: float = mean_difference / pooled_standard_deviation

        # Standard error of the mean of the pairwise differences.
        standard_error_difference: float = np.std(group_1 - group_2, ddof=1) / np.sqrt(
            number_of_pairs
        )
        # The confidence level is passed positionally on purpose: the keyword
        # name of this argument differs between SciPy releases.
        confidence_interval: tuple[float, float] = stats.t.interval(
            0.95,
            degrees_of_freedom,
            loc=mean_difference,
            scale=standard_error_difference,
        )

        paired_ttest_summary: str = (
            f"Mean of {first_group_name}: {mean_group_1}\n"
            f"Mean of {second_group_name}: {mean_group_2}\n"
            f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"
            f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"
            f"Degrees of Freedom: {degrees_of_freedom}\n"
            f"Cohen's d: {cohens_d}\n"
            f"T-statistic: {t_statistic}\n"
            f"P-value: {p_value}"
        )

        self._report(paired_ttest_summary, name_save_file)

        return mean_difference, confidence_interval[0], confidence_interval[1]