uncongeniality_analysis/src/analysis_functions/ttest.py

import os
import pandas as pd
import numpy as np
from scipy import stats as stats
from scipy.stats import ttest_ind, ttest_rel
from pathlib import Path

from src.data_loading_and_saving.print_and_save_results import print_and_save_result


class TTest:
    """
    Run independent and paired samples T-tests on columns of a DataFrame.

    Each analysis builds a plain-text summary and hands it, together with the
    print/save flags and the output directory, to ``print_and_save_result``.

    Attributes
    ----------
    print_result: bool
        Flag forwarded to ``print_and_save_result`` to request printing of the summary.
    save_result: bool
        Flag forwarded to ``print_and_save_result`` to request saving of the summary.
    filepath: str
        The directory path handed to ``print_and_save_result`` as the output location.

    Methods
    -------
    perform_ttest(data, first_group_name, second_group_name, name_save_file) -> None:
        Performs an independent samples T-test for the two specified groups in the provided dataset.
    perform_paired_ttest(data, first_group_name, second_group_name, name_save_file) -> tuple[float, float, float]:
        Performs a paired samples T-test on the two specified groups in the provided dataset.
        Returns the mean difference with the 95% confidence interval.
    """

    def __init__(
        self,
        print_result: bool = True,
        save_result: bool = True,
        filepath: str = "results/",
    ):
        """
        Store the output settings and make sure the output directory exists.

        Parameters
        ----------
        print_result: bool, optional
            A flag to indicate if the result should be printed to the standard output. Default is True.
        save_result: bool, optional
            A flag to indicate if the result should be saved to a file. Default is True.
        filepath: str, optional
            The directory path where the result files will be saved. Default is "results/".
            The directory (including missing parents) is created on construction,
            regardless of the value of save_result.

        Raises
        ------
        OSError
            If the directory cannot be created (e.g. the path exists but is a regular file).
        """
        self.print_result: bool = print_result
        self.save_result: bool = save_result
        self.filepath: str = filepath
        # exist_ok=True replaces the former isdir()-then-makedirs() sequence, which
        # could fail when another process created the directory in between.
        os.makedirs(filepath, exist_ok=True)

    def perform_ttest(
        self,
        data: pd.DataFrame,
        first_group_name: str,
        second_group_name: str,
        name_save_file: Path,
    ) -> None:
        """
        Perform a T-test for the means of two independent samples and report the result.

        Missing values are dropped from each column separately, so the two
        groups may end up with different sizes.

        Parameters
        ----------
        data: pandas.DataFrame
            The input dataframe which contains the data.
        first_group_name: str
            The name of the first column to be used in the t-test.
        second_group_name: str
            The name of the second column to be used in the t-test.
        name_save_file: Path
            The name of the file, passed on to ``print_and_save_result`` together
            with ``self.filepath``.

        Returns
        -------
        None
        """
        # Column selections are Series; NaNs are removed per group (unpaired design).
        group_1: pd.Series = data[first_group_name].dropna()
        group_2: pd.Series = data[second_group_name].dropna()

        # ttest_ind is called with its defaults, so the reported degrees of
        # freedom follow the pooled-variance formula n1 + n2 - 2.
        degrees_of_freedom: int = len(group_1) + len(group_2) - 2

        t_statistic: float
        p_value: float
        t_statistic, p_value = ttest_ind(group_1, group_2)

        mean_group_1: float = group_1.mean()
        mean_group_2: float = group_2.mean()

        # pandas' Series.std() uses ddof=1 by default (sample standard deviation).
        standard_deviation_group_1: float = group_1.std()
        standard_deviation_group_2: float = group_2.std()

        ttest_summary: str = (
            f"Mean of {first_group_name}: {mean_group_1}\n"
            f"Mean of {second_group_name}: {mean_group_2}\n"
            f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"
            f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"
            f"Degrees of Freedom: {degrees_of_freedom}\n"
            f"T-statistic: {t_statistic}\n"
            f"P-value: {p_value}"
        )

        print_and_save_result(
            self.print_result,
            self.save_result,
            self.filepath,
            ttest_summary,
            name_save_file,
        )

    def perform_paired_ttest(
        self,
        data: pd.DataFrame,
        first_group_name: str,
        second_group_name: str,
        name_save_file: Path,
    ) -> tuple[float, float, float]:
        """
        Perform a paired samples T-test, report it, and return the mean difference with its 95% CI.

        Rows with a missing value in either column are dropped before the
        analysis, so only complete pairs are compared. The reported effect
        size (Cohen's d) divides the difference in means by the root mean
        square of the two group standard deviations.

        Parameters
        ----------
        data : pd.DataFrame
            The DataFrame containing the data of two related groups to be compared
        first_group_name : str
            The name of the first group (column)
        second_group_name : str
            The name of the second group (column)
        name_save_file: Path
            The name of the file, passed on to ``print_and_save_result`` together
            with ``self.filepath``.

        Returns
        -------
        mean_difference : float
            The mean of the first group minus the mean of the second group
        lower_bound : float
            The lower bound of the 95% confidence interval of the mean difference
        upper_bound : float
            The upper bound of the 95% confidence interval of the mean difference
        """
        # Keep complete pairs only; a separate name avoids shadowing the parameter.
        paired_data: pd.DataFrame = data[[first_group_name, second_group_name]].dropna()
        group_1: pd.Series = paired_data[first_group_name]
        group_2: pd.Series = paired_data[second_group_name]
        number_of_pairs: int = len(group_1)
        degrees_of_freedom: int = number_of_pairs - 1

        t_statistic: float
        p_value: float
        t_statistic, p_value = ttest_rel(group_1, group_2)

        mean_group_1: float = group_1.mean()
        mean_group_2: float = group_2.mean()
        standard_deviation_group_1: float = group_1.std()
        standard_deviation_group_2: float = group_2.std()

        # Effect size standardised by the average of the two group variances.
        pooled_standard_deviation: float = np.sqrt(
            (standard_deviation_group_1**2 + standard_deviation_group_2**2) / 2
        )
        mean_difference: float = mean_group_1 - mean_group_2
        cohens_d: float = mean_difference / pooled_standard_deviation

        # Standard error of the mean of the pairwise differences (sample SD, ddof=1).
        pairwise_differences: pd.Series = group_1 - group_2
        standard_error_difference: float = pairwise_differences.std(ddof=1) / np.sqrt(
            number_of_pairs
        )

        # t-distribution interval centred on the mean difference.
        lower_bound: float
        upper_bound: float
        lower_bound, upper_bound = stats.t.interval(
            0.95,
            degrees_of_freedom,
            loc=mean_difference,
            scale=standard_error_difference,
        )

        paired_ttest_summary: str = (
            f"Mean of {first_group_name}: {mean_group_1}\n"
            f"Mean of {second_group_name}: {mean_group_2}\n"
            f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"
            f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"
            f"Degrees of Freedom: {degrees_of_freedom}\n"
            f"Cohen's d: {cohens_d}\n"
            f"T-statistic: {t_statistic}\n"
            f"P-value: {p_value}"
        )

        print_and_save_result(
            self.print_result,
            self.save_result,
            self.filepath,
            paired_ttest_summary,
            name_save_file,
        )

        return mean_difference, lower_bound, upper_bound
public repository of the uncongeniality_analysis. Initial publication 2024-07-22 09:51:10 +02:00			`import os`
			`import pandas as pd`
			`import numpy as np`
			`from scipy import stats as stats`
			`from scipy.stats import ttest_ind, ttest_rel`
			`from pathlib import Path`

			`from src.data_loading_and_saving.print_and_save_results import print_and_save_result`


			`class TTest:`
			`"""`
			`A class used to perform a T-test analysis.`

			`...`

			`Attributes`
			`----------`
			`print_result: bool`
			`A flag used to indicate if the function should print the result to the standard output.`
			`save_result: bool`
			`A flag used to indicate if the function should save the result to a file.`
			`filepath: str`
			`The directory path where the result files will be saved if save_result is True.`

			`Methods`
			`-------`
			`perform_ttest(data, first_group_name, second_group_name, name_save_file) -> None:`
			`Performs an independent samples T-test for the two specified groups in the provided dataset.`
			`perform_paired_ttest(data, first_group_name, second_group_name, name_save_file) -> tuple[float, float, float]:`
			`Performs a paired samples T-test on the two specified groups in the provided dataset.`
			`Returns the mean difference with the 95% confidence interval.`
			`"""`

			`def __init__(`
			`self,`
			`print_result: bool = True,`
			`save_result: bool = True,`
			`filepath: str = "results/",`
			`):`
			`"""`
			`Initializes the TTest object with the provided parameters.`

			`Parameters`
			`----------`
			`print_result: bool, optional`
			`A flag to indicate if the function should print the result to the standard output. Default is True.`
			`save_result: bool, optional`
			`A flag to indicate if the function should save the result to a file. Default is True.`
			`filepath: str, optional`
			`The directory path where the result files will be saved if save_result is True. Default is "results/".`
			`"""`
			`self.print_result: bool = print_result`
			`self.save_result: bool = save_result`
			`self.filepath: str = filepath`
			`if not os.path.isdir(filepath):`
			`os.makedirs(filepath)`

			`def perform_ttest(`
			`self,`
			`data: pd.DataFrame,`
			`first_group_name: str,`
			`second_group_name: str,`
			`name_save_file: Path,`
			`) -> None:`
			`"""`
			`This method performs a T-test for the means of two independent samples of scores using`
			`the given columns and either prints or saves the result based on the TTest object properties.`

			`Parameters`
			`----------`
			`data: pandas.DataFrame`
			`The input dataframe which contains the data.`
			`first_group_name: str`
			`The name of the first column to be used in the t-test.`
			`second_group_name: str`
			`The name of the second column to be used in the t-test.`
			`name_save_file: str`
			`The name of the file to which the result will be saved (if self.save_result is True).`

			`Returns`
			`-------`
			`None`
			`"""`
			`group_1: pd.DataFrame = data[first_group_name].dropna()`
			`group_2: pd.DataFrame = data[second_group_name].dropna()`

			`degrees_of_freedom: int = len(group_1) + len(group_2) - 2`

			`t_statistic: float`
			`p_value: float`
			`t_statistic, p_value = ttest_ind(group_1, group_2)`

			`mean_group_1: float = group_1.mean()`
			`mean_group_2: float = group_2.mean()`

			`standard_deviation_group_1: float = group_1.std()`
			`standard_deviation_group_2: float = group_2.std()`

			`ttest_summary: str = (`
			`f"Mean of {first_group_name}: {mean_group_1}\n"`
			`f"Mean of {second_group_name}: {mean_group_2}\n"`
			`f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"`
			`f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"`
			`f"Degrees of Freedom: {degrees_of_freedom}\n"`
			`f"T-statistic: {t_statistic}\n"`
			`f"P-value: {p_value}"`
			`)`

			`print_and_save_result(`
			`self.print_result,`
			`self.save_result,`
			`self.filepath,`
			`ttest_summary,`
			`name_save_file,`
			`)`

			`def perform_paired_ttest(`
			`self,`
			`data: pd.DataFrame,`
			`first_group_name: str,`
			`second_group_name: str,`
			`name_save_file: Path,`
			`) -> tuple[float, float, float]:`
			`"""`
			`Performs a paired sample t-test and calculates the effect size (Cohen's d) using the given columns`

			`Parameters`
			`----------`
			`data : pd.DataFrame`
			`The DataFrame containing the data of two related groups to be compared`
			`first_group_name : str`
			`The name of the first group (column)`
			`second_group_name : str`
			`the name of the second group (column)`
			`name_save_file: Path`
			`The path of the file to save the result in.`

			`Returns`
			`-------`
			`mean_difference : float`
			`The mean difference between the two samples`
			`confidence_interval[0] : float`
			`The lower bound of the 95% confidence interval`
			`confidence_interval[1] : float`
			`The upper bound of the 95% confidence interval`
			`"""`

			`data: pd.DataFrame = data[[first_group_name, second_group_name]].dropna()`
			`group_1: pd.DataFrame = data[first_group_name]`
			`group_2: pd.DataFrame = data[second_group_name]`
			`degrees_of_freedom: int = len(group_1) - 1`

			`t_statistic: float`
			`p_value: float`
			`t_statistic, p_value = ttest_rel(group_1, group_2)`

			`mean_group_1: float = group_1.mean()`
			`mean_group_2: float = group_2.mean()`
			`standard_deviation_group_1: float = group_1.std()`
			`standard_deviation_group_2: float = group_2.std()`
			`pooled_standard_deviation: float = np.sqrt(`
			`(standard_deviation_group_1**2 + standard_deviation_group_2**2) / 2`
			`)`
			`cohens_d: float = (mean_group_1 - mean_group_2) / pooled_standard_deviation`

			`mean_difference: float = mean_group_1 - mean_group_2`
			`standard_error_difference: float = np.std(group_1 - group_2, ddof=1) / np.sqrt(`
			`len(group_1)`
			`)`

			`confidence_interval: np.ndarray[float] = stats.t.interval(`
			`0.95, len(group_1) - 1, loc=mean_difference, scale=standard_error_difference`
			`)`

			`paired_ttest_summary: str = (`
			`f"Mean of {first_group_name}: {mean_group_1}\n"`
			`f"Mean of {second_group_name}: {mean_group_2}\n"`
			`f"Standard Deviation of {first_group_name}: {standard_deviation_group_1}\n"`
			`f"Standard Deviation of {second_group_name}: {standard_deviation_group_2}\n"`
			`f"Degrees of Freedom: {degrees_of_freedom}\n"`
			`f"Cohen's d: {cohens_d}\n"`
			`f"T-statistic: {t_statistic}\n"`
			`f"P-value: {p_value}"`
			`)`

			`print_and_save_result(`
			`self.print_result,`
			`self.save_result,`
			`self.filepath,`
			`paired_ttest_summary,`
			`name_save_file,`
			`)`

			`return mean_difference, confidence_interval[0], confidence_interval[1]`