"""
Data processor for statistical analysis
"""
import numpy as np
import pandas as pd
from typing import Union, Optional, Dict, Any, Tuple
import warnings
class DataProcessor:
    """
    Simple class for loading and basic processing of numerical data.

    This class provides basic methods to load and validate numerical data
    for statistical analysis, converting data to numpy arrays. Distribution
    fitting is delegated to an internal adjuster object that is created
    lazily on first use.
    """

    def __init__(
        self,
        data: Optional[Union[np.ndarray, list, pd.Series, pd.DataFrame]] = None,
    ):
        """
        Initialize the data processor.

        Parameters
        ----------
        data : array-like, optional
            Data to be processed. Can be:
            - numpy array
            - list of values
            - pandas Series
            - pandas DataFrame (will be flattened)
            If omitted, the processor starts empty and ``load_data`` must be
            called before any analysis.
        """
        self.data = None
        self.metadata = {}
        # Store original data to preserve datetime index for extremes analysis
        self._original_data = None
        # Internal adjuster for distribution fitting
        self._adjuster = None
        if data is not None:
            self.load_data(data)

    @staticmethod
    def _as_flat_array(data) -> np.ndarray:
        """Convert supported input to a fresh, 1D numpy array (always a copy)."""
        if isinstance(data, np.ndarray):
            values = data
        elif isinstance(data, (pd.Series, pd.DataFrame)):
            values = np.asarray(data.to_numpy())
        elif isinstance(data, list):
            # Conversion errors for lists propagate unchanged (ValueError).
            values = np.array(data, dtype=float)
        else:
            # Try to convert any array-like object
            try:
                values = np.array(data, dtype=float)
            except (ValueError, TypeError) as err:
                raise TypeError(
                    f"Cannot convert data of type {type(data)} to numpy array."
                ) from err

        # np.isnan is undefined for object/string/void dtypes; coerce those to
        # float so that e.g. None entries become NaN instead of crashing later.
        if values.dtype.kind not in "biufcmM":
            try:
                values = values.astype(float)
            except (ValueError, TypeError) as err:
                raise TypeError(
                    f"Cannot convert data with dtype {values.dtype} "
                    "to a numeric array."
                ) from err

        # flatten() always copies and always yields 1D (also for 0-d input),
        # so the processor never aliases the caller's buffer.
        return values.flatten()

    def load_data(
        self, data: Union[np.ndarray, list, pd.Series, pd.DataFrame]
    ) -> 'DataProcessor':
        """
        Load data and convert to numpy array.

        Parameters
        ----------
        data : array-like
            Data to be loaded. Can be:
            - numpy array
            - list of values
            - pandas Series
            - pandas DataFrame (will be flattened)

        Returns
        -------
        DataProcessor
            Processor instance with loaded data

        Raises
        ------
        TypeError
            If the input cannot be converted to a numeric numpy array.
        ValueError
            If no data points remain after removing NaN values.

        Warns
        -----
        UserWarning
            If NaN values were found (they are removed).
        """
        values = self._as_flat_array(data)

        # Remove NaN values and warn if any were found
        nan_mask = np.isnan(values)
        n_nan = int(nan_mask.sum())
        if n_nan:
            warnings.warn(
                f"Found {n_nan} NaN values. They will be removed.",
                stacklevel=2,
            )
            values = values[~nan_mask]
        if values.size == 0:
            raise ValueError("No valid data points after removing NaN values.")

        # Commit state only after validation so a failed load leaves the
        # processor exactly as it was.
        self._original_data = data
        self.data = values
        # Any previously fitted distribution described the old data.
        self._adjuster = None
        self._update_metadata()
        return self

    def get_data_array(self) -> np.ndarray:
        """
        Return data as 1D numpy array.

        Returns
        -------
        ndarray
            1D numpy array with the data (a copy)

        Raises
        ------
        ValueError
            If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("No data has been loaded.")
        return self.data.copy()

    def get_basic_stats(self) -> Dict[str, Any]:
        """
        Return basic descriptive statistics of the data.

        Returns
        -------
        dict
            Dictionary with the keys 'count', 'mean', 'std', 'var', 'min',
            'max', 'median', 'q25' and 'q75'. 'std' and 'var' are the sample
            estimates (ddof=1) and are NaN when only one data point exists.

        Raises
        ------
        ValueError
            If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("No data has been loaded.")

        values = self.data
        count = len(values)
        if count > 1:
            std = np.std(values, ddof=1)  # Sample standard deviation
            var = np.var(values, ddof=1)  # Sample variance
        else:
            # ddof=1 is undefined for a single point; return NaN directly
            # instead of triggering numpy's RuntimeWarning.
            std = var = np.float64(np.nan)

        return {
            'count': count,
            'mean': np.mean(values),
            'std': std,
            'var': var,
            'min': np.min(values),
            'max': np.max(values),
            'median': np.median(values),
            'q25': np.percentile(values, 25),
            'q75': np.percentile(values, 75),
        }

    def _get_adjuster(self):
        """Get or create the internal adjuster."""
        if self._adjuster is None:
            # Imported lazily, like the other helper modules in this class.
            from .magic_adjuster import MagicAdjuster
            self._adjuster = MagicAdjuster(self)
        return self._adjuster

    def fit_distribution(
        self, distribution: Union[str, object], **kwargs
    ) -> 'DataProcessor':
        """
        Fit a statistical distribution to the data.

        Parameters
        ----------
        distribution : str or scipy.stats distribution
            Distribution to fit. Can be:
            - String: 'weibull', 'gamma', 'lognorm', 'norm', etc.
            - SciPy distribution object: stats.weibull_min, stats.gamma, etc.
        **kwargs : dict
            Additional arguments passed to the distribution's fit method

        Returns
        -------
        DataProcessor
            Processor instance with fitted distribution
        """
        adjuster = self._get_adjuster()
        adjuster.fit_distribution(distribution, **kwargs)
        return self

    def get_fitted_params(self) -> Tuple:
        """
        Get the fitted distribution parameters.

        Returns
        -------
        tuple
            Fitted parameters of the distribution

        Raises
        ------
        ValueError
            If no distribution has been fitted yet.
        """
        if self._adjuster is None:
            raise ValueError("No distribution has been fitted yet.")
        return self._adjuster.get_fitted_params()

    def get_distribution_info(self) -> Dict[str, Any]:
        """
        Get information about the fitted distribution.

        Returns
        -------
        dict
            Dictionary with distribution information

        Raises
        ------
        ValueError
            If no distribution has been fitted yet.
        """
        if self._adjuster is None:
            raise ValueError("No distribution has been fitted yet.")
        return self._adjuster.get_distribution_info()

    def get_auto_fitter(self, candidates=None, criterion='rmse'):
        """
        Create an AutoFitter instance for automatic distribution selection.

        This method follows the factory pattern - it creates an AutoFitter
        instance when needed, allowing automatic testing of multiple
        distributions to find the best fit based on specified criteria.

        Parameters
        ----------
        candidates : list of str, optional
            List of distribution names to test. If None, uses default set
            including: 'weibull_min', 'lognorm', 'gamma', 'norm', 'expon',
            'rayleigh', 'chi2', 'beta'
        criterion : str, default 'rmse'
            Selection criterion ('rmse', 'aic', 'bic', 'ks_pvalue',
            'chi2_pvalue')

        Returns
        -------
        AutoFitter
            AutoFitter instance configured with this data

        Raises
        ------
        ValueError
            If no data has been loaded.

        Examples
        --------
        >>> import magica as ma
        >>> import numpy as np
        >>>
        >>> # Load data
        >>> data = np.random.weibull(2, 1000)
        >>> processor = ma.read_data(data)
        >>>
        >>> # Get auto fitter and find best distribution
        >>> auto_fitter = processor.get_auto_fitter()
        >>> best_result = auto_fitter.fit_best_distribution()
        >>> print(f"Best distribution: {best_result['distribution']}")
        """
        # Check the cheap precondition before paying for the import.
        if self.data is None:
            raise ValueError("No data has been loaded.")
        # Import here to avoid circular imports
        from .auto_fitter import AutoFitter
        return AutoFitter(self, candidates=candidates, criterion=criterion)

    def get_extremes_analyzer(
        self,
        times: Optional[Union[np.ndarray, pd.Series, pd.DatetimeIndex]] = None,
        time_unit: str = 'years'
    ):
        """
        Create an ExtremesAnalyzer instance for extreme value analysis.

        This method provides access to return period and return value analysis
        for extreme events in time series data.

        Parameters
        ----------
        times : array-like, optional
            Time values corresponding to data points. Can be:
            - pandas DatetimeIndex
            - pandas Series with datetime values
            - numpy array of datetime64
            - numpy array of numeric values (e.g., years)
            - None if data is pandas Series with datetime index
        time_unit : str, default='years'
            Unit for return period calculations.
            Options: 'years', 'days', 'hours', 'months'

        Returns
        -------
        ExtremesAnalyzer
            ExtremesAnalyzer instance configured with this data

        Raises
        ------
        ValueError
            If no data has been loaded.

        Examples
        --------
        >>> import numpy as np
        >>> import pandas as pd
        >>> import magica as ma
        >>>
        >>> # Using pandas Series with datetime index
        >>> dates = pd.date_range('2000-01-01', periods=1000, freq='D')
        >>> values = np.random.weibull(2, 1000) * 10
        >>> series = pd.Series(values, index=dates)
        >>>
        >>> processor = ma.read_data(series)
        >>> extremes = processor.get_extremes_analyzer()
        >>> extremes.fit_distribution('genextreme')
        >>>
        >>> # Calculate 100-year return value
        >>> rv_100 = extremes.return_value(100)
        >>>
        >>> # Or provide times separately
        >>> processor2 = ma.read_data(values)
        >>> extremes2 = processor2.get_extremes_analyzer(times=dates)
        """
        # Check the cheap precondition before paying for the import.
        if self.data is None:
            raise ValueError("No data has been loaded.")
        # Import here to avoid circular imports
        from .extremes_analyzer import ExtremesAnalyzer
        return ExtremesAnalyzer(self, times=times, time_unit=time_unit)

    def _update_metadata(self):
        """Update data metadata (length, dtype and time of last update)."""
        if self.data is not None:
            self.metadata['length'] = len(self.data)
            self.metadata['dtype'] = str(self.data.dtype)
            self.metadata['last_updated'] = pd.Timestamp.now()

    def __repr__(self) -> str:
        """String representation of the object."""
        if self.data is None:
            return "DataProcessor(no data loaded)"
        # Check if distribution is fitted via adjuster
        dist_info = ""
        adjuster = self._adjuster
        if adjuster is not None and adjuster.distribution_name is not None:
            dist_info = f", distribution='{adjuster.distribution_name}'"
        return (
            f"DataProcessor(length={len(self.data)}, "
            f"dtype={self.data.dtype}{dist_info})"
        )

    def __len__(self) -> int:
        """Return the length of the data (0 when nothing is loaded)."""
        if self.data is None:
            return 0
        return len(self.data)

    def __getattr__(self, name):
        """
        Delegate method calls to the internal MagicAdjuster.

        This allows direct access to all scipy.stats methods like:
        cdf, pdf, ppf, sf, isf, rvs, stats, etc.

        Python only calls this hook after normal attribute lookup has failed.

        Parameters
        ----------
        name : str
            Name of the method/attribute to access

        Returns
        -------
        Any
            Method or attribute from the fitted distribution

        Raises
        ------
        AttributeError
            If no adjuster exists yet, if the adjuster does not provide
            ``name``, or if ``name`` is a dunder (protocol) name.
        """
        # Protocol hooks (__setstate__, __deepcopy__, ...) are probed by
        # copy/pickle on half-built instances and must never be delegated.
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

        # Read from __dict__ directly: going through self._adjuster would
        # re-enter __getattr__ (infinite recursion) whenever the attribute is
        # not set yet, e.g. on instances created via __new__.
        adjuster = self.__dict__.get('_adjuster')
        if adjuster is None:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'. "
                "Did you forget to call fit_distribution() first?"
            )

        # Delegate to the adjuster's __getattr__
        try:
            return getattr(adjuster, name)
        except AttributeError:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            ) from None