preprocessing/tests/test_data_loader.py
2025-12-15 13:47:28 +01:00

383 lines
15 KiB
Python

import pytest
import pandas as pd
from unittest.mock import patch, mock_open
from src.utils.data_loader import (
load_yaml,
DataLoader,
load_questionnaire_scales,
assemble_wave_info,
)
class TestLoadYaml:
    """Tests for ``load_yaml``, mostly with ``builtins.open`` replaced by a mock."""

    def test_yaml_file_loads_correctly(self):
        """A nested mapping in the file is returned as nested dictionaries."""
        fake_file = mock_open(read_data="key1: value1\nkey2:\n nested: value2")
        with patch("builtins.open", fake_file):
            parsed = load_yaml("test.yaml")
        expected = {"key1": "value1", "key2": {"nested": "value2"}}
        assert parsed == expected

    def test_yaml_file_not_found_raises_exception(self):
        """A path that cannot be opened is expected to raise ``FileNotFoundError``."""
        # Nothing is patched here, so this relies on no file with this name
        # being reachable from wherever the tests are run.
        missing_path = "nonexistent.yaml"
        with pytest.raises(FileNotFoundError):
            load_yaml(missing_path)

    def test_yaml_file_with_empty_content(self):
        """An empty file is expected to load as ``None``."""
        fake_file = mock_open(read_data="")
        with patch("builtins.open", fake_file):
            parsed = load_yaml("test.yaml")
        assert parsed is None

    def test_yaml_file_with_invalid_syntax_raises_exception(self):
        """Malformed YAML text is expected to raise some exception."""
        fake_file = mock_open(read_data="invalid: yaml: content: [")
        # The exact exception class is deliberately not pinned down here.
        with patch("builtins.open", fake_file), pytest.raises(Exception):
            load_yaml("test.yaml")
class TestDataLoader:
    """Tests for ``DataLoader`` construction and ``load_all_survey_data``.

    Every loading test patches ``pandas.read_csv``, so no CSV file is read
    from disk.
    """

    def test_dataloader_initializes_with_all_waves_when_none_specified(self):
        """Omitting the wave list selects every wave that has a data file."""
        settings = {
            "data_directory": "/data",
            "data_file_for_each_wave": {1: "wave1.csv", 2: "wave2.csv"},
            "config_file_for_each_wave": {1: "config1.yaml", 2: "config2.yaml"},
        }
        loader = DataLoader(settings)
        # Materialise both sides before comparing. A dict keys view compares
        # set-like (order is ignored) and is never equal to a list, so the
        # direct comparison only passes when the loader keeps a keys view.
        expected_waves = list(settings["data_file_for_each_wave"])
        assert list(loader.waves_to_process) == expected_waves

    def test_dataloader_initializes_with_specified_waves(self):
        """An explicit wave list is stored as given."""
        settings = {
            "data_directory": "/data",
            "data_file_for_each_wave": {1: "wave1.csv", 2: "wave2.csv", 3: "wave3.csv"},
            "config_file_for_each_wave": {
                1: "config1.yaml",
                2: "config2.yaml",
                3: "config3.yaml",
            },
        }
        loader = DataLoader(settings, [1, 3])
        assert loader.waves_to_process == [1, 3]

    def test_dataloader_stores_settings_correctly(self):
        """The three settings entries are exposed as attributes of the loader."""
        settings = {
            "data_directory": "/test/data",
            "data_file_for_each_wave": {1: "test.csv"},
            "config_file_for_each_wave": {1: "test.yaml"},
        }
        loader = DataLoader(settings)
        assert loader.data_directory == "/test/data"
        assert loader.data_file_for_each_wave == {1: "test.csv"}
        assert loader.config_file_for_each_wave == {1: "test.yaml"}

    @patch("pandas.read_csv")
    def test_dataloader_loads_survey_data_for_specified_waves(self, mock_read_csv):
        """Only the requested wave is read, from its file in the data directory."""
        import os

        mock_df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
        mock_read_csv.return_value = mock_df
        settings = {
            "data_directory": "/data",
            "data_file_for_each_wave": {1: "wave1.csv", 2: "wave2.csv"},
            "config_file_for_each_wave": {1: "config1.yaml", 2: "config2.yaml"},
        }
        loader = DataLoader(settings, [1])
        result = loader.load_all_survey_data()
        assert 1 in result
        assert "data" in result[1]
        assert "config_path" in result[1]
        assert result[1]["config_path"] == "config1.yaml"
        # Build the expected path with os.path.join so the separator matches
        # the platform the tests run on.
        expected_path = os.path.join("/data", "wave1.csv")
        mock_read_csv.assert_called_once_with(expected_path)

    @patch("pandas.read_csv")
    def test_dataloader_loads_multiple_waves(self, mock_read_csv):
        """Each requested wave gets its own entry with the matching config path."""
        mock_df1 = pd.DataFrame({"wave1_col": [1, 2]})
        mock_df2 = pd.DataFrame({"wave2_col": [3, 4]})
        # One frame per expected read_csv call, handed out in call order.
        mock_read_csv.side_effect = [mock_df1, mock_df2]
        settings = {
            "data_directory": "/data",
            "data_file_for_each_wave": {1: "wave1.csv", 2: "wave2.csv"},
            "config_file_for_each_wave": {1: "config1.yaml", 2: "config2.yaml"},
        }
        loader = DataLoader(settings, [1, 2])
        result = loader.load_all_survey_data()
        assert len(result) == 2
        assert 1 in result and 2 in result
        assert result[1]["config_path"] == "config1.yaml"
        assert result[2]["config_path"] == "config2.yaml"

    @patch("pandas.read_csv")
    def test_dataloader_handles_csv_read_error(self, mock_read_csv):
        """A ``FileNotFoundError`` from ``read_csv`` reaches the caller."""
        mock_read_csv.side_effect = FileNotFoundError("CSV file not found")
        settings = {
            "data_directory": "/data",
            "data_file_for_each_wave": {1: "nonexistent.csv"},
            "config_file_for_each_wave": {1: "config1.yaml"},
        }
        loader = DataLoader(settings, [1])
        with pytest.raises(FileNotFoundError):
            loader.load_all_survey_data()
class TestLoadQuestionnaireScales:
    """Tests for ``load_questionnaire_scales``.

    Both ``builtins.open`` and ``yaml.safe_load`` are patched in every test,
    so each test supplies the already-parsed document directly and no YAML
    text is ever parsed.
    """

    @staticmethod
    def _load_scales(parsed_document):
        """Run ``load_questionnaire_scales`` with *parsed_document* as the parse result.

        The file handle is an empty ``mock_open()`` because ``yaml.safe_load``
        is replaced, so whatever the handle would yield is never looked at.
        """
        with patch("builtins.open", mock_open()), patch(
            "yaml.safe_load", return_value=parsed_document
        ):
            return load_questionnaire_scales("test.yaml", questionnaire_name="q1")

    def test_questionnaire_scales_loads_from_valid_yaml(self):
        """Scales are returned keyed by name, with their entries preserved."""
        result = self._load_scales(
            {
                "scales": [
                    {"name": "scale1", "items": ["item1", "item2"]},
                    {"name": "scale2", "items": ["item3", "item4"]},
                ]
            }
        )
        assert "scale1" in result
        assert "scale2" in result
        assert result["scale1"]["items"] == ["item1", "item2"]

    def test_questionnaire_scales_handles_empty_scales_list(self):
        """An empty ``scales`` list yields an empty dictionary."""
        result = self._load_scales({"scales": []})
        assert result == {}

    def test_questionnaire_scales_loads_complex_structure(self):
        """Additional fields on a scale entry are carried through to the result."""
        result = self._load_scales(
            {
                "questionnaire": "test_questionnaire",
                "scales": [
                    {
                        "name": "choice_favorite_ai_user",
                        "label": "Choice of favorite AI system",
                        "calculation": "categorical",
                        "response_options": {"1": "ChatGPT", "2": "Claude"},
                        "output": "choice_favorite_ai_user",
                    }
                ],
            }
        )
        assert "choice_favorite_ai_user" in result
        assert result["choice_favorite_ai_user"]["calculation"] == "categorical"

    def test_questionnaire_scales_handles_missing_scales_key(self):
        """A document without a ``scales`` key is expected to raise ``KeyError``."""
        with pytest.raises(KeyError):
            self._load_scales({"questionnaire": "test"})
class TestAssembleWaveInfo:
    """Tests for ``assemble_wave_info``.

    ``load_yaml`` and ``load_questionnaire_scales`` are patched inside
    ``src.utils.data_loader`` for every test, so the wave configuration and
    the scales of each questionnaire are whatever the mocks return. The tests
    index the result positionally: ``[0]`` scale dictionary, ``[1]`` subgroup
    per scale, ``[2]`` excluded scales, ``[3]`` composite scales.
    """

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    @patch("os.path.isabs")
    @patch("os.path.normpath")
    @patch("os.path.join")
    def test_wave_info_assembles_with_absolute_questionnaire_paths(
        self, mock_join, mock_normpath, mock_isabs, mock_load_scales, mock_load_yaml
    ):
        """A path reported as absolute is expected to reach the scale loader unchanged."""
        # The os.path functions are mocked so the outcome does not depend on
        # the platform's own idea of an absolute path.
        mock_isabs.return_value = True
        wave_config = {
            "questionnaires": [{"name": "q1", "path": "/absolute/path/q1.yaml"}]
        }
        mock_load_yaml.return_value = wave_config
        loaded_scales = {
            "scale1": {"items": ["item1", "item2"], "questionnaire": "q1"},
            "scale2": {"items": ["item3", "item4"], "questionnaire": "q1"},
        }
        mock_load_scales.return_value = loaded_scales

        wave_info = assemble_wave_info(
            "wave_config.yaml", {"questionnaire_directory": "/base"}
        )

        mock_load_scales.assert_called_once_with("/absolute/path/q1.yaml", "q1")
        assert "scale1" in wave_info[0]

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    @patch("os.path.isabs")
    @patch("os.path.normpath")
    @patch("os.path.join")
    def test_wave_info_assembles_with_relative_questionnaire_paths(
        self, mock_join, mock_normpath, mock_isabs, mock_load_scales, mock_load_yaml
    ):
        """A relative path is expected to be joined onto the questionnaire directory."""
        resolved_path = "/base/relative/q1.yaml"
        mock_isabs.return_value = False
        mock_join.return_value = resolved_path
        mock_normpath.return_value = resolved_path
        wave_config = {"questionnaires": [{"name": "q1", "path": "relative/q1.yaml"}]}
        mock_load_yaml.return_value = wave_config
        mock_load_scales.return_value = {"scale1": {"questionnaire": "q1"}}

        wave_info = assemble_wave_info(
            "wave_config.yaml", {"questionnaire_directory": "/base"}
        )

        assert "scale1" in wave_info[0]
        mock_join.assert_called_once_with("/base", "relative/q1.yaml")
        mock_load_scales.assert_called_once_with("/base/relative/q1.yaml", "q1")

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    @patch("logging.info")
    def test_wave_info_assigns_all_subgroup_to_scales_without_subgroup(
        self, mock_log, mock_load_scales, mock_load_yaml
    ):
        """Without ``subgroup_scales`` a scale gets subgroup ``"all"`` and one info log."""
        wave_config = {"questionnaires": [{"name": "q1", "path": "/path/q1.yaml"}]}
        mock_load_yaml.return_value = wave_config
        mock_load_scales.return_value = {"scale1": {"questionnaire": "q1"}}

        wave_info = assemble_wave_info(
            "wave_config.yaml", {"questionnaire_directory": "/base"}
        )

        subgroup_by_scale = wave_info[1]
        assert subgroup_by_scale["scale1"] == "all"
        mock_log.assert_called_once()

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    def test_wave_info_handles_subgroup_scales_by_questionnaire_name(
        self, mock_load_scales, mock_load_yaml
    ):
        """A subgroup keyed by questionnaire name applies to all of its scales."""
        wave_config = {
            "questionnaires": [{"name": "q1", "path": "/path/q1.yaml"}],
            "subgroup_scales": {"q1": "group1"},
        }
        mock_load_yaml.return_value = wave_config
        loaded_scales = {
            "scale1": {"questionnaire": "q1"},
            "scale2": {"questionnaire": "q1"},
        }
        mock_load_scales.return_value = loaded_scales

        wave_info = assemble_wave_info(
            "wave_config.yaml", {"questionnaire_directory": "/base"}
        )

        subgroup_by_scale = wave_info[1]
        assert subgroup_by_scale["scale1"] == "group1"
        assert subgroup_by_scale["scale2"] == "group1"

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    def test_wave_info_handles_subgroup_scales_by_scale_name(
        self, mock_load_scales, mock_load_yaml
    ):
        """A subgroup keyed by scale name applies to that scale."""
        wave_config = {
            "questionnaires": [{"name": "q1", "path": "/path/q1.yaml"}],
            "subgroup_scales": {"scale1": "specific_group"},
        }
        mock_load_yaml.return_value = wave_config
        loaded_scales = {
            "scale1": {"questionnaire": "q1"},
            "scale2": {"questionnaire": "q1"},
        }
        mock_load_scales.return_value = loaded_scales

        wave_info = assemble_wave_info(
            "wave_config.yaml", {"questionnaire_directory": "/base"}
        )

        subgroup_by_scale = wave_info[1]
        assert subgroup_by_scale["scale1"] == "specific_group"

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    def test_wave_info_raises_error_for_invalid_subgroup_entry(
        self, mock_load_scales, mock_load_yaml
    ):
        """A ``subgroup_scales`` key matching no scale or questionnaire is rejected."""
        wave_config = {
            "questionnaires": [{"name": "q1", "path": "/path/q1.yaml"}],
            "subgroup_scales": {"nonexistent": "group1"},
        }
        mock_load_yaml.return_value = wave_config
        mock_load_scales.return_value = {"scale1": {"questionnaire": "q1"}}

        expected_message = "Entry 'nonexistent' in subgroup_scales is not a loaded scale or questionnaire name"
        with pytest.raises(ValueError, match=expected_message):
            assemble_wave_info("wave_config.yaml", {"questionnaire_directory": "/base"})

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    def test_wave_info_returns_composite_scales_when_present(
        self, mock_load_scales, mock_load_yaml
    ):
        """``composite_scales`` from the wave configuration appear at index 3."""
        wave_config = {
            "questionnaires": [{"name": "q1", "path": "/path/q1.yaml"}],
            "composite_scales": {"composite1": {"items": ["scale1", "scale2"]}},
        }
        mock_load_yaml.return_value = wave_config
        mock_load_scales.return_value = {"scale1": {"questionnaire": "q1"}}

        wave_info = assemble_wave_info(
            "wave_config.yaml", {"questionnaire_directory": "/base"}
        )

        composite_scales = wave_info[3]
        assert "composite1" in composite_scales
        assert composite_scales["composite1"]["items"] == ["scale1", "scale2"]

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    def test_wave_info_returns_empty_composite_scales_when_absent(
        self, mock_load_scales, mock_load_yaml
    ):
        """Without ``composite_scales`` in the configuration, index 3 is an empty dict."""
        wave_config = {"questionnaires": [{"name": "q1", "path": "/path/q1.yaml"}]}
        mock_load_yaml.return_value = wave_config
        mock_load_scales.return_value = {"scale1": {"questionnaire": "q1"}}

        wave_info = assemble_wave_info(
            "wave_config.yaml", {"questionnaire_directory": "/base"}
        )

        assert wave_info[3] == {}

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    def test_wave_info_handles_multiple_questionnaires(
        self, mock_load_scales, mock_load_yaml
    ):
        """Scales from every listed questionnaire end up in one scale dictionary."""
        wave_config = {
            "questionnaires": [
                {"name": "q1", "path": "/path/q1.yaml"},
                {"name": "q2", "path": "/path/q2.yaml"},
            ]
        }
        mock_load_yaml.return_value = wave_config
        # One return value per questionnaire, consumed in call order.
        mock_load_scales.side_effect = [
            {"scale1": {"questionnaire": "q1"}},
            {"scale2": {"questionnaire": "q2"}},
        ]

        wave_info = assemble_wave_info(
            "wave_config.yaml", {"questionnaire_directory": "/base"}
        )

        scale_dictionary = wave_info[0]
        assert "scale1" in scale_dictionary
        assert "scale2" in scale_dictionary
        assert len(scale_dictionary) == 2

    @patch("src.utils.data_loader.load_yaml")
    @patch("src.utils.data_loader.load_questionnaire_scales")
    def test_wave_info_returns_correct_tuple_structure(
        self, mock_load_scales, mock_load_yaml
    ):
        """The result is a 4-tuple of dict, dict, set and dict, in that order."""
        wave_config = {"questionnaires": [{"name": "q1", "path": "/path/q1.yaml"}]}
        mock_load_yaml.return_value = wave_config
        mock_load_scales.return_value = {"scale1": {"questionnaire": "q1"}}

        wave_info = assemble_wave_info(
            "wave_config.yaml", {"questionnaire_directory": "/base"}
        )

        expected_types = (
            dict,  # scale_dictionary
            dict,  # final_subgroup_scales
            set,  # excluded_scales
            dict,  # composite_scales
        )
        assert isinstance(wave_info, tuple)
        assert len(wave_info) == 4
        for element, expected_type in zip(wave_info, expected_types):
            assert isinstance(element, expected_type)