55 lines
1.6 KiB
Python
55 lines
1.6 KiB
Python
"""
|
|
The `article_data_preprocessor.py` module serves as a preprocessing subclass, based on
|
|
the `DataPreprocessor` abstract base class, for dealing with article data.
|
|
|
|
"""
|
|
|
|
from pathlib import Path
|
|
import pandas as pd
|
|
|
|
from src.data_preprocessor import DataPreprocessor
|
|
from src.utils import load_json_file
|
|
|
|
|
|
class ArticleDataPreprocessor(DataPreprocessor):
    """
    Preprocessor for article data files, built on `DataPreprocessor`.

    Attributes
    ----------
    No attributes are defined here; everything is inherited from the
    abstract base class DataPreprocessor.

    Methods
    -------
    preprocess(article_file_path: Path) -> pd.DataFrame
        Load a single article data file and return it wrapped in a DataFrame.
    """

    def __init__(self, data_folder_article_data: Path):
        """
        Initialise the preprocessor for a folder of article data files.

        Parameters
        ----------
        data_folder_article_data : Path
            Directory in which the article data files are located. It is
            handed to the base-class constructor together with the fixed
            split word "SPON_article".
        """
        # The split word is fixed for article data; how it is used is decided
        # by the DataPreprocessor base class.
        super().__init__(data_folder_article_data, split_word="SPON_article")

    def preprocess(self, article_file_path: Path) -> pd.DataFrame:
        """
        Load one article data file and return its content as a DataFrame.

        Parameters
        ----------
        article_file_path : pathlib.Path
            Path of the article data file to load via `load_json_file`.

        Returns
        -------
        pd.DataFrame
            DataFrame built from a one-element list holding the loaded
            article object.
        """
        article_record: dict = load_json_file(article_file_path)
        # Wrap the single loaded object in a list before handing it to pandas.
        records = [article_record]
        return pd.DataFrame(records)
|