preprocessing/tests/test_scale_processor.py
2025-12-15 13:47:28 +01:00

434 lines
14 KiB
Python

import pytest
import pandas as pd
import numpy as np
from src.scale_processor import ScaleProcessor
class TestScaleProcessor:
    """Unit tests for ``ScaleProcessor``.

    Covers construction defaults and overrides, ``check_items``,
    ``get_subgroup_mask`` and every calculation mode exercised through
    ``process``.

    NOTE(review): the test methods do not carry the default ``test_`` prefix,
    so presumably the project's pytest configuration customises
    ``python_functions`` to collect them -- confirm, otherwise none of these
    tests run.
    """

    @staticmethod
    def initializes_with_basic_scale_config():
        """A minimal config (name + items) yields the expected defaults."""
        config = {"name": "test_scale", "items": [{"id": "item1"}, {"id": "item2"}]}
        processor = ScaleProcessor(config)
        assert processor.name == "test_scale"
        assert processor.items == [{"id": "item1"}, {"id": "item2"}]
        assert processor.calculation == "mean"
        assert processor.score_min == 1
        assert processor.score_max == 5
        assert processor.output == "test_scale"
        assert processor.subgroup is None

    @staticmethod
    def initializes_with_custom_configuration():
        """Explicit config values and a subgroup argument override defaults."""
        config = {
            "name": "custom_scale",
            "items": [{"id": "q1"}],
            "calculation": "sum",
            "score_range": (0, 10),
            "response_options": {"1": "Yes", "2": "No"},
            "output": "custom_output",
        }
        processor = ScaleProcessor(config, "group1")
        assert processor.calculation == "sum"
        assert processor.score_min == 0
        assert processor.score_max == 10
        assert processor.response_options == {"1": "Yes", "2": "No"}
        assert processor.output == "custom_output"
        assert processor.subgroup == "group1"

    @staticmethod
    def check_items_passes_when_all_columns_present():
        """``check_items`` does not raise when every item column exists."""
        config = {"name": "test", "items": [{"id": "col1"}, {"id": "col2"}]}
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]})
        # Passing means "no exception"; there is no return value to assert on.
        processor.check_items(df)

    @staticmethod
    def check_items_raises_error_when_columns_missing():
        """``check_items`` raises ``ValueError`` naming the missing columns."""
        config = {"name": "test", "items": [{"id": "col1"}, {"id": "missing"}]}
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
        with pytest.raises(
            ValueError, match="Missing columns in data: \\['missing'\\]"
        ):
            processor.check_items(df)

    @staticmethod
    def get_subgroup_mask_returns_all_true_when_no_subgroup():
        """Without a subgroup the mask selects every row."""
        config = {"name": "test", "items": [{"id": "col1"}]}
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"col1": [1, 2, 3]})
        mask = processor.get_subgroup_mask(df)
        assert mask.all()
        assert len(mask) == 3

    @staticmethod
    def get_subgroup_mask_returns_all_true_when_subgroup_is_all():
        """The subgroup value ``"all"`` selects every row."""
        config = {"name": "test", "items": [{"id": "col1"}]}
        processor = ScaleProcessor(config, "all")
        df = pd.DataFrame({"col1": [1, 2, 3]})
        mask = processor.get_subgroup_mask(df)
        assert mask.all()

    @staticmethod
    def get_subgroup_mask_filters_by_subgroup_column():
        """The mask follows the boolean values of the subgroup column."""
        config = {"name": "test", "items": [{"id": "col1"}]}
        processor = ScaleProcessor(config, "group")
        df = pd.DataFrame({"col1": [1, 2, 3], "group": [True, False, True]})
        mask = processor.get_subgroup_mask(df)
        # Elements taken from a boolean Series are numpy.bool_ objects, so an
        # identity check (`is True`) fails even when the value is correct.
        # tolist() converts to plain Python bools for a value comparison.
        assert mask.tolist() == [True, False, True]

    @staticmethod
    def get_subgroup_mask_returns_all_true_when_subgroup_column_missing():
        """A subgroup with no matching column selects every row."""
        config = {"name": "test", "items": [{"id": "col1"}]}
        processor = ScaleProcessor(config, "nonexistent")
        df = pd.DataFrame({"col1": [1, 2, 3]})
        mask = processor.get_subgroup_mask(df)
        assert mask.all()

    @staticmethod
    def process_calculates_mean_by_default():
        """With no ``calculation`` given, the row-wise item mean is produced."""
        config = {"name": "test", "items": [{"id": "q1"}, {"id": "q2"}]}
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [2, 4, 6], "q2": [4, 6, 8]})
        result = processor.process(df)
        assert result.columns[0] == "test"
        assert result["test"].iloc[0] == 3.0
        assert result["test"].iloc[1] == 5.0
        assert result["test"].iloc[2] == 7.0

    @staticmethod
    def process_calculates_sum_when_specified():
        """``calculation: sum`` adds the item values per row."""
        config = {
            "name": "sum_scale",
            "items": [{"id": "q1"}, {"id": "q2"}],
            "calculation": "sum",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2, 3], "q2": [4, 5, 6]})
        result = processor.process(df)
        assert result["sum_scale"].iloc[0] == 5
        assert result["sum_scale"].iloc[1] == 7
        assert result["sum_scale"].iloc[2] == 9

    @staticmethod
    def process_handles_item_inversion():
        """Items flagged ``inverse`` are reversed as ``max + min - value``."""
        config = {
            "name": "inverted",
            "items": [{"id": "q1", "inverse": True}, {"id": "q2"}],
            "score_range": (1, 5),
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 5], "q2": [3, 3]})
        result = processor.process(df)
        assert result["inverted"].iloc[0] == 4.0  # (5+1-1+3)/2 = 4
        assert result["inverted"].iloc[1] == 2.0  # (5+1-5+3)/2 = 2

    @staticmethod
    def process_handles_categorical_calculation_single_item():
        """Categorical mode maps numeric responses to their option labels."""
        config = {
            "name": "category",
            "items": [{"id": "q1"}],
            "calculation": "categorical",
            "response_options": {"1": "Option A", "2": "Option B", "3": "Option C"},
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2, 3, 1]})
        result = processor.process(df)
        assert result["category"].iloc[0] == "Option A"
        assert result["category"].iloc[1] == "Option B"
        assert result["category"].iloc[2] == "Option C"
        assert result["category"].iloc[3] == "Option A"

    @staticmethod
    def process_raises_error_for_categorical_with_multiple_items():
        """Categorical mode rejects scales with more than one item."""
        config = {
            "name": "category",
            "items": [{"id": "q1"}, {"id": "q2"}],
            "calculation": "categorical",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2], "q2": [1, 2]})
        with pytest.raises(
            ValueError,
            match="calculation 'categorical' is only for single-item scales",
        ):
            processor.process(df)

    @staticmethod
    def process_handles_categorical_with_open_ended_other_option():
        """Open-ended text is emitted in ``<output>_other_text``; blanks are NA."""
        config = {
            "name": "category",
            "items": [{"id": "q1", "open_ended_id": "q1_other"}],
            "calculation": "categorical",
            "response_options": {"1": "Option A", "10": "Other"},
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame(
            {
                "q1": [1, 10, 1, 10],
                "q1_other": ["", "Custom text", "", "Another custom"],
            }
        )
        result = processor.process(df)
        assert result["category"].iloc[0] == "Option A"
        assert result["category"].iloc[1] == "Other"
        assert pd.isna(result["category_other_text"].iloc[0])
        assert result["category_other_text"].iloc[1] == "Custom text"
        assert pd.isna(result["category_other_text"].iloc[2])
        assert result["category_other_text"].iloc[3] == "Another custom"

    @staticmethod
    def process_handles_ordinal_calculation_single_item():
        """Ordinal mode maps numeric responses to their option labels."""
        config = {
            "name": "ordinal",
            "items": [{"id": "q1"}],
            "calculation": "ordinal",
            "response_options": {1: "Low", 2: "Medium", 3: "High"},
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2, 3, 2]})
        result = processor.process(df)
        assert result["ordinal"].iloc[0] == "Low"
        assert result["ordinal"].iloc[1] == "Medium"
        assert result["ordinal"].iloc[2] == "High"
        assert result["ordinal"].iloc[3] == "Medium"

    @staticmethod
    def process_raises_error_for_ordinal_with_multiple_items():
        """Ordinal mode rejects scales with more than one item."""
        config = {
            "name": "ordinal",
            "items": [{"id": "q1"}, {"id": "q2"}],
            "calculation": "ordinal",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2], "q2": [1, 2]})
        with pytest.raises(
            ValueError,
            match="calculation 'ordinal' only allowed with single-item scales",
        ):
            processor.process(df)

    @staticmethod
    def process_handles_response_calculation_single_item():
        """Response mode passes the single item's values through unchanged."""
        config = {
            "name": "response",
            "items": [{"id": "q1"}],
            "calculation": "response",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1.5, 2.7, 3.9]})
        result = processor.process(df)
        assert result["response"].iloc[0] == 1.5
        assert result["response"].iloc[1] == 2.7
        assert result["response"].iloc[2] == 3.9

    @staticmethod
    def process_raises_error_for_response_with_multiple_items():
        """Response mode rejects scales with more than one item."""
        config = {
            "name": "response",
            "items": [{"id": "q1"}, {"id": "q2"}],
            "calculation": "response",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2], "q2": [1, 2]})
        with pytest.raises(
            ValueError,
            match="calculation 'response' can only be used with single-item scales!",
        ):
            processor.process(df)

    @staticmethod
    def process_handles_sum_correct_calculation():
        """``sum_correct`` counts, per row, the items matching ``correct``."""
        config = {
            "name": "correct_sum",
            "items": [
                {"id": "q1", "correct": 2},
                {"id": "q2", "correct": 1},
                {"id": "q3", "correct": 3},
            ],
            "calculation": "sum_correct",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame(
            {
                "q1": [2, 1, 2],  # correct, wrong, correct
                "q2": [1, 1, 2],  # correct, correct, wrong
                "q3": [3, 2, 3],  # correct, wrong, correct
            }
        )
        result = processor.process(df)
        assert result["correct_sum"].iloc[0] == 3  # all correct
        assert result["correct_sum"].iloc[1] == 1  # one correct
        assert result["correct_sum"].iloc[2] == 2  # two correct

    @staticmethod
    def process_handles_mean_correct_calculation():
        """``mean_correct`` yields, per row, the share of correct items."""
        config = {
            "name": "correct_mean",
            "items": [{"id": "q1", "correct": 1}, {"id": "q2", "correct": 2}],
            "calculation": "mean_correct",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame(
            {
                "q1": [1, 1, 2],  # correct, correct, wrong
                "q2": [2, 1, 2],  # correct, wrong, correct
            }
        )
        result = processor.process(df)
        assert result["correct_mean"].iloc[0] == 1.0  # 2/2 = 1.0
        assert result["correct_mean"].iloc[1] == 0.5  # 1/2 = 0.5
        assert result["correct_mean"].iloc[2] == 0.5  # 1/2 = 0.5

    @staticmethod
    def process_raises_error_for_unknown_correct_calculation():
        """Items with ``correct`` keys reject an unrecognised calculation."""
        config = {
            "name": "test",
            "items": [{"id": "q1", "correct": 1}],
            "calculation": "unknown_correct",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2]})
        with pytest.raises(
            ValueError,
            match="Unknown calculation for objective items: unknown_correct",
        ):
            processor.process(df)

    @staticmethod
    def process_raises_error_for_unknown_calculation_type():
        """An unrecognised calculation name raises ``ValueError``."""
        config = {"name": "test", "items": [{"id": "q1"}], "calculation": "unknown"}
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2]})
        with pytest.raises(ValueError, match="Unknown calculation: unknown"):
            processor.process(df)

    @staticmethod
    def process_applies_subgroup_filtering():
        """Rows whose subgroup flag is False come out as NA."""
        config = {
            "name": "filtered",
            "items": [{"id": "q1"}],
            "calculation": "response",
        }
        processor = ScaleProcessor(config, "group")
        df = pd.DataFrame({"q1": [10, 20, 30], "group": [True, False, True]})
        result = processor.process(df)
        assert result["filtered"].iloc[0] == 10
        assert pd.isna(result["filtered"].iloc[1])
        assert result["filtered"].iloc[2] == 30

    @staticmethod
    def process_handles_missing_values_in_mean_calculation():
        """The mean ignores NaN items and averages the remaining values."""
        config = {"name": "with_na", "items": [{"id": "q1"}, {"id": "q2"}]}
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, np.nan, 3], "q2": [2, 4, np.nan]})
        result = processor.process(df)
        assert result["with_na"].iloc[0] == 1.5  # (1+2)/2
        assert result["with_na"].iloc[1] == 4.0  # only q2 value
        assert result["with_na"].iloc[2] == 3.0  # only q1 value

    @staticmethod
    def process_handles_missing_values_in_categorical_calculation():
        """A NaN response stays NA in categorical output."""
        config = {
            "name": "category_na",
            "items": [{"id": "q1"}],
            "calculation": "categorical",
            "response_options": {"1": "Yes", "2": "No"},
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, np.nan, 2]})
        result = processor.process(df)
        assert result["category_na"].iloc[0] == "Yes"
        assert pd.isna(result["category_na"].iloc[1])
        assert result["category_na"].iloc[2] == "No"

    @staticmethod
    def process_uses_custom_output_name():
        """The ``output`` config key, not ``name``, names the result column."""
        config = {
            "name": "original_name",
            "items": [{"id": "q1"}],
            "output": "custom_output",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2, 3]})
        result = processor.process(df)
        assert "custom_output" in result.columns
        assert "original_name" not in result.columns

    @staticmethod
    def process_raises_error_for_ordinal_without_response_options_dict():
        """Ordinal mode rejects ``response_options`` that is not a dict."""
        config = {
            "name": "ordinal",
            "items": [{"id": "q1"}],
            "calculation": "ordinal",
            "response_options": ["Not a dict"],
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2]})
        with pytest.raises(
            ValueError,
            match="For calculation 'ordinal', response_options must be a dict mapping",
        ):
            processor.process(df)

    @staticmethod
    def process_raises_error_for_categorical_without_response_options_dict():
        """Categorical mode rejects ``response_options`` that is not a dict."""
        config = {
            "name": "categorical",
            "items": [{"id": "q1"}],
            "calculation": "categorical",
            "response_options": "Not a dict",
        }
        processor = ScaleProcessor(config)
        df = pd.DataFrame({"q1": [1, 2]})
        with pytest.raises(
            ValueError,
            match="response_options must be a dict for calculation 'categorical'",
        ):
            processor.process(df)