Source code for magica.core.data_processor

"""
Data processor for statistical analysis
"""

import numpy as np
import pandas as pd
from typing import Union, Optional, Dict, Any, Tuple
import warnings


class DataProcessor:
    """
    Simple class for loading and basic processing of numerical data.

    This class provides basic methods to load and validate numerical data
    for statistical analysis, converting data to numpy arrays.

    Attributes that are not found on the processor itself are looked up on
    the internal adjuster (see ``__getattr__``) once one has been created
    by ``fit_distribution``.
    """

    def __init__(self, data: Optional[Union[np.ndarray, list, pd.Series, pd.DataFrame]] = None):
        """
        Initialize the data processor.

        Parameters
        ----------
        data : array-like, optional
            Data to be processed. Can be:
            - numpy array
            - list of values
            - pandas Series
            - pandas DataFrame (will be flattened)
        """
        self.data = None
        self.metadata = {}
        # Store original data to preserve datetime index for extremes analysis
        self._original_data = None
        # Internal adjuster for distribution fitting (created lazily)
        self._adjuster = None

        if data is not None:
            self.load_data(data)

    @staticmethod
    def _to_1d_array(data) -> np.ndarray:
        """Convert a supported input container to an independent 1D numpy array."""
        if isinstance(data, np.ndarray):
            # flatten() always returns a copy and also promotes 0-d arrays
            # to shape (1,), so len() works on the result.
            return data.flatten()
        if isinstance(data, list):
            # ravel() so nested lists end up 1D like every other input type.
            return np.array(data, dtype=float).ravel()
        if isinstance(data, pd.Series):
            # Copy so that later edits to the caller's Series do not leak
            # into the processor.
            return np.array(data.values)
        if isinstance(data, pd.DataFrame):
            return data.values.flatten()

        # Try to convert any other array-like object
        try:
            return np.array(data, dtype=float).flatten()
        except (ValueError, TypeError) as err:
            raise TypeError(
                f"Cannot convert data of type {type(data)} to numpy array."
            ) from err

    def load_data(self, data: Union[np.ndarray, list, pd.Series, pd.DataFrame]) -> 'DataProcessor':
        """
        Load data and convert to a 1D numpy array.

        NaN values are dropped with a warning. The processor's state is only
        updated once the new data has been validated, so a failed load leaves
        previously loaded data untouched.

        Parameters
        ----------
        data : array-like
            Data to be loaded. Can be:
            - numpy array
            - list of values
            - pandas Series
            - pandas DataFrame (will be flattened)

        Returns
        -------
        DataProcessor
            Processor instance with loaded data

        Raises
        ------
        TypeError
            If a generic array-like object cannot be converted to a float array.
        ValueError
            If no data points remain after removing NaN values.
        """
        values = self._to_1d_array(data)

        # Remove NaN values and warn if any were found
        nan_mask = np.isnan(values)
        if nan_mask.any():
            n_nan = int(nan_mask.sum())
            warnings.warn(f"Found {n_nan} NaN values. They will be removed.")
            values = values[~nan_mask]

        if len(values) == 0:
            raise ValueError("No valid data points after removing NaN values.")

        # Commit only after validation succeeded.
        # NOTE(review): an adjuster created by an earlier fit_distribution()
        # call is kept as-is here; confirm whether it should be reset when
        # new data is loaded.
        self._original_data = data
        self.data = values
        self._update_metadata()
        return self

    def get_data_array(self) -> np.ndarray:
        """
        Return data as 1D numpy array.

        Returns
        -------
        ndarray
            Copy of the loaded data as a 1D numpy array

        Raises
        ------
        ValueError
            If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("No data has been loaded.")
        return self.data.copy()

    def get_basic_stats(self) -> Dict[str, Any]:
        """
        Return basic descriptive statistics of the data.

        Returns
        -------
        dict
            Dictionary with keys 'count', 'mean', 'std', 'var', 'min', 'max',
            'median', 'q25' and 'q75'. 'std' and 'var' use ``ddof=1``.

        Raises
        ------
        ValueError
            If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("No data has been loaded.")

        return {
            'count': len(self.data),
            'mean': np.mean(self.data),
            'std': np.std(self.data, ddof=1),  # Sample standard deviation
            'var': np.var(self.data, ddof=1),  # Sample variance
            'min': np.min(self.data),
            'max': np.max(self.data),
            'median': np.median(self.data),
            'q25': np.percentile(self.data, 25),
            'q75': np.percentile(self.data, 75),
        }

    def _get_adjuster(self):
        """Get or create the internal adjuster."""
        if self._adjuster is None:
            # Import here rather than at module level (deferred import)
            from .magic_adjuster import MagicAdjuster
            self._adjuster = MagicAdjuster(self)
        return self._adjuster

    def fit_distribution(self, distribution: Union[str, object], **kwargs) -> 'DataProcessor':
        """
        Fit a statistical distribution to the data.

        Parameters
        ----------
        distribution : str or scipy.stats distribution
            Distribution to fit. Can be:
            - String: 'weibull', 'gamma', 'lognorm', 'norm', etc.
            - SciPy distribution object: stats.weibull_min, stats.gamma, etc.
        **kwargs : dict
            Additional arguments passed to the distribution's fit method

        Returns
        -------
        DataProcessor
            Processor instance with fitted distribution

        Raises
        ------
        ValueError
            If no data has been loaded.
        """
        # Same guard as get_auto_fitter / get_extremes_analyzer: fail early
        # with a clear message instead of creating an adjuster without data.
        if self.data is None:
            raise ValueError("No data has been loaded.")

        adjuster = self._get_adjuster()
        adjuster.fit_distribution(distribution, **kwargs)
        return self

    def get_fitted_params(self) -> Tuple:
        """
        Get the fitted distribution parameters.

        Returns
        -------
        tuple
            Fitted parameters of the distribution

        Raises
        ------
        ValueError
            If no adjuster exists yet, i.e. fit_distribution() was not called.
        """
        if self._adjuster is None:
            raise ValueError("No distribution has been fitted yet.")
        return self._adjuster.get_fitted_params()

    def get_distribution_info(self) -> Dict[str, Any]:
        """
        Get information about the fitted distribution.

        Returns
        -------
        dict
            Dictionary with distribution information

        Raises
        ------
        ValueError
            If no adjuster exists yet, i.e. fit_distribution() was not called.
        """
        if self._adjuster is None:
            raise ValueError("No distribution has been fitted yet.")
        return self._adjuster.get_distribution_info()

    def get_auto_fitter(self, candidates=None, criterion='rmse'):
        """
        Create an AutoFitter instance for automatic distribution selection.

        This method follows the factory pattern - it creates an AutoFitter
        instance when needed, allowing automatic testing of multiple
        distributions to find the best fit based on specified criteria.

        Parameters
        ----------
        candidates : list of str, optional
            List of distribution names to test. If None, uses default set
            including: 'weibull_min', 'lognorm', 'gamma', 'norm', 'expon',
            'rayleigh', 'chi2', 'beta'
        criterion : str, default 'rmse'
            Selection criterion ('rmse', 'aic', 'bic', 'ks_pvalue',
            'chi2_pvalue')

        Returns
        -------
        AutoFitter
            AutoFitter instance configured with this data

        Raises
        ------
        ValueError
            If no data has been loaded.

        Examples
        --------
        >>> import magica as ma
        >>> import numpy as np
        >>>
        >>> # Load data
        >>> data = np.random.weibull(2, 1000)
        >>> processor = ma.read_data(data)
        >>>
        >>> # Get auto fitter and find best distribution
        >>> auto_fitter = processor.get_auto_fitter()
        >>> best_result = auto_fitter.fit_best_distribution()
        >>> print(f"Best distribution: {best_result['distribution']}")
        """
        # Import here to avoid circular imports
        from .auto_fitter import AutoFitter

        if self.data is None:
            raise ValueError("No data has been loaded.")

        return AutoFitter(self, candidates=candidates, criterion=criterion)

    def get_extremes_analyzer(
        self,
        times: Optional[Union[np.ndarray, pd.Series, pd.DatetimeIndex]] = None,
        time_unit: str = 'years'
    ):
        """
        Create an ExtremesAnalyzer instance for extreme value analysis.

        This method provides access to return period and return value
        analysis for extreme events in time series data.

        Parameters
        ----------
        times : array-like, optional
            Time values corresponding to data points. Can be:
            - pandas DatetimeIndex
            - pandas Series with datetime values
            - numpy array of datetime64
            - numpy array of numeric values (e.g., years)
            - None if data is pandas Series with datetime index
        time_unit : str, default='years'
            Unit for return period calculations.
            Options: 'years', 'days', 'hours', 'months'

        Returns
        -------
        ExtremesAnalyzer
            ExtremesAnalyzer instance configured with this data

        Raises
        ------
        ValueError
            If no data has been loaded.

        Examples
        --------
        >>> import numpy as np
        >>> import pandas as pd
        >>> import magica as ma
        >>>
        >>> # Using pandas Series with datetime index
        >>> dates = pd.date_range('2000-01-01', periods=1000, freq='D')
        >>> values = np.random.weibull(2, 1000) * 10
        >>> series = pd.Series(values, index=dates)
        >>>
        >>> processor = ma.read_data(series)
        >>> extremes = processor.get_extremes_analyzer()
        >>> extremes.fit_distribution('genextreme')
        >>>
        >>> # Calculate 100-year return value
        >>> rv_100 = extremes.return_value(100)
        >>>
        >>> # Or provide times separately
        >>> processor2 = ma.read_data(values)
        >>> extremes2 = processor2.get_extremes_analyzer(times=dates)
        """
        # Import here to avoid circular imports
        from .extremes_analyzer import ExtremesAnalyzer

        if self.data is None:
            raise ValueError("No data has been loaded.")

        return ExtremesAnalyzer(self, times=times, time_unit=time_unit)

    def _update_metadata(self):
        """Update data metadata (length, dtype and last-updated timestamp)."""
        if self.data is not None:
            self.metadata['length'] = len(self.data)
            self.metadata['dtype'] = str(self.data.dtype)
            self.metadata['last_updated'] = pd.Timestamp.now()

    def __repr__(self) -> str:
        """String representation of the object."""
        if self.data is None:
            return "DataProcessor(no data loaded)"

        # Check if distribution is fitted via adjuster
        dist_info = ""
        if self._adjuster is not None and self._adjuster.distribution_name is not None:
            dist_info = f", distribution='{self._adjuster.distribution_name}'"

        return f"DataProcessor(length={len(self.data)}, dtype={self.data.dtype}{dist_info})"

    def __len__(self) -> int:
        """Return the length of the data (0 when nothing is loaded)."""
        if self.data is None:
            return 0
        return len(self.data)

    def __getattr__(self, name):
        """
        Delegate attribute access to the internal MagicAdjuster.

        This allows direct access to all scipy.stats methods like:
        cdf, pdf, ppf, sf, isf, rvs, stats, etc.

        Python only calls this hook after normal attribute lookup has failed.

        Parameters
        ----------
        name : str
            Name of the method/attribute to access

        Returns
        -------
        Any
            Method or attribute from the fitted distribution

        Raises
        ------
        AttributeError
            If no adjuster exists, if ``name`` is a dunder name, or if the
            adjuster does not provide ``name``.
        """
        # Read the adjuster straight from the instance dict: going through
        # ``self._adjuster`` would re-enter __getattr__ and recurse without
        # bound whenever the attribute is not set yet (e.g. an instance built
        # by copy/pickle before its state has been restored).
        adjuster = self.__dict__.get('_adjuster')

        # Never forward special-method probes (``__setstate__``,
        # ``__deepcopy__``, ...) to the adjuster; those must be answered by
        # this class alone.
        is_dunder = name.startswith('__') and name.endswith('__')
        if is_dunder and adjuster is not None:
            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")

        # First check if we have an adjuster with a fitted distribution
        if adjuster is None:
            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'. "
                                 "Did you forget to call fit_distribution() first?")

        # Delegate to the adjuster's attribute lookup
        try:
            return getattr(adjuster, name)
        except AttributeError:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            ) from None