Source code for ctd_tools.modules.subsetter

import numbers

import pandas as pd
import xarray

import ctd_tools.ctd_parameters as ctdparams

class CtdSubsetter:
    """
    Subsets sensor data based on sample number, time, and parameter values.

    This class allows for flexible slicing of sensor data stored in an
    xarray Dataset. It can filter data based on sample indices, time ranges,
    and specific parameter values. All setters return the instance so calls
    can be chained.

    Example usage:
        subsetter = CtdSubsetter(ds)
        ds_subset = (
            subsetter
            .set_sample_min(10)
            .set_sample_max(50)
            .set_time_min("2023-01-01")
            .set_time_max("2023-01-31")
            .get_subset()
        )

    Attributes:
    ------------
    data : xarray.Dataset
        The xarray Dataset containing the sensor data to be subsetted.
    min_sample : int, optional
        The minimum sample index to include in the subset.
    max_sample : int, optional
        The maximum sample index to include in the subset.
    min_datetime : pd.Timestamp, optional
        The minimum time to include in the subset.
    max_datetime : pd.Timestamp, optional
        The maximum time to include in the subset.
    parameter_name : str, optional
        The name of the parameter to filter by.
    parameter_value_min : float, optional
        The minimum value of the parameter to include in the subset.
    parameter_value_max : float, optional
        The maximum value of the parameter to include in the subset.
    """

    def __init__(self, data: xarray.Dataset):
        """
        Initializes the CtdSubsetter with the provided xarray Dataset.

        Parameters:
        -----------
        data (xarray.Dataset):
            The xarray Dataset containing the sensor data to be subsetted.

        Raises:
        -------
        TypeError:
            If the provided data is not a xarray.Dataset.
        """
        if not isinstance(data, xarray.Dataset):
            raise TypeError("Data must be a xarray.Dataset")

        # Store the dataset
        self.data = data

        # Slicing parameters; None means "no restriction" for that bound.
        self.min_sample = None
        self.max_sample = None
        self.min_datetime = None
        self.max_datetime = None
        self.parameter_name = None
        self.parameter_value_max = None
        self.parameter_value_min = None

    @staticmethod
    def _validate_sample_index(value) -> None:
        """
        Ensures that a sample index is an integer.

        ``numbers.Integral`` is used instead of ``int`` so that NumPy integer
        scalars (e.g. values taken from an index array) are accepted too.

        Raises:
        -------
        TypeError:
            If the provided value is not an integer.
        """
        if not isinstance(value, numbers.Integral):
            raise TypeError("Sample index must be an integer")

    def set_sample_min(self, value: int) -> "CtdSubsetter":
        """
        Sets the minimum sample index for slicing the dataset.

        Parameters:
        -----------
        value (int):
            The minimum sample index to include in the subset.

        Returns:
        --------
        CtdSubsetter:
            The current instance of CtdSubsetter with the updated minimum
            sample index.

        Raises:
        -------
        TypeError:
            If the provided value is not an integer.
        """
        self._validate_sample_index(value)
        self.min_sample = value
        return self

    def set_sample_max(self, value: int) -> "CtdSubsetter":
        """
        Sets the maximum sample index for slicing the dataset.

        Parameters:
        -----------
        value (int):
            The maximum sample index to include in the subset.

        Returns:
        --------
        CtdSubsetter:
            The current instance of CtdSubsetter with the updated maximum
            sample index.

        Raises:
        -------
        TypeError:
            If the provided value is not an integer.
        """
        # Validate here (as set_sample_min does) so a bad value fails at the
        # call site instead of deep inside get_subset().
        self._validate_sample_index(value)
        self.max_sample = value
        return self

    def __handle_time_value(self, value: str | pd.Timestamp) -> pd.Timestamp:
        """
        Converts a time value to a pandas Timestamp.

        Parameters:
        -----------
        value (str or pd.Timestamp):
            The time value to convert. Can be a string or a pandas Timestamp.

        Returns:
        --------
        pd.Timestamp:
            The converted time value as a pandas Timestamp.

        Raises:
        -------
        TypeError:
            If the provided value is not a string or a pandas Timestamp.
        """
        if isinstance(value, pd.Timestamp):
            return value
        if isinstance(value, str):
            return pd.Timestamp(value)
        raise TypeError("Time value must be a string or a pandas Timestamp")

    def set_time_min(self, value: str | pd.Timestamp) -> "CtdSubsetter":
        """
        Sets the minimum time for slicing the dataset.

        Parameters:
        -----------
        value (str or pd.Timestamp):
            The minimum time to include in the subset. Can be a string or a
            pandas Timestamp.

        Returns:
        --------
        CtdSubsetter:
            The current instance of CtdSubsetter with the updated minimum time.

        Raises:
        -------
        TypeError:
            If the provided value is not a string or a pandas Timestamp.
        """
        # Type validation and conversion both happen in the helper.
        self.min_datetime = self.__handle_time_value(value)
        return self

    def set_time_max(self, value: str | pd.Timestamp) -> "CtdSubsetter":
        """
        Sets the maximum time for slicing the dataset.

        Parameters:
        -----------
        value (str or pd.Timestamp):
            The maximum time to include in the subset. Can be a string or a
            pandas Timestamp.

        Returns:
        --------
        CtdSubsetter:
            The current instance of CtdSubsetter with the updated maximum time.

        Raises:
        -------
        TypeError:
            If the provided value is not a string or a pandas Timestamp.
        """
        # Type validation and conversion both happen in the helper.
        self.max_datetime = self.__handle_time_value(value)
        return self

    def set_parameter_name(self, value: str) -> "CtdSubsetter":
        """
        Sets the name of the parameter to filter by.

        Parameters:
        -----------
        value (str):
            The name of the parameter to filter by. This should be a valid
            variable name in the dataset.

        Returns:
        --------
        CtdSubsetter:
            The current instance of CtdSubsetter with the updated parameter
            name.

        Raises:
        -------
        TypeError:
            If the provided value is not a string.
        ValueError:
            If the provided parameter name is not found in the dataset.
        """
        if not isinstance(value, str):
            raise TypeError("Parameter name must be a string")
        if value not in self.data:
            raise ValueError(f"Parameter '{value}' not found in dataset")

        self.parameter_name = value
        return self

    def set_parameter_value_max(self, value: int | float) -> "CtdSubsetter":
        """
        Sets the maximum value of the parameter to include in the subset.

        Parameters:
        -----------
        value (float):
            The maximum value of the parameter to include in the subset.

        Returns:
        --------
        CtdSubsetter:
            The current instance of CtdSubsetter with the updated maximum
            parameter value.

        Raises:
        -------
        TypeError:
            If the provided value is not a number (int or float).
        """
        if not isinstance(value, (int, float)):
            raise TypeError("Parameter value must be a number (int or float)")

        self.parameter_value_max = value
        return self

    def set_parameter_value_min(self, value: int | float) -> "CtdSubsetter":
        """
        Sets the minimum value of the parameter to include in the subset.

        Parameters:
        -----------
        value (float):
            The minimum value of the parameter to include in the subset.

        Returns:
        --------
        CtdSubsetter:
            The current instance of CtdSubsetter with the updated minimum
            parameter value.

        Raises:
        -------
        TypeError:
            If the provided value is not a number (int or float).
        """
        if not isinstance(value, (int, float)):
            raise TypeError("Parameter value must be a number (int or float)")

        self.parameter_value_min = value
        return self

    def __slice_by_sample_number(self, subset: xarray.Dataset) -> xarray.Dataset:
        """
        Slices the dataset by sample number (index).

        The sample indices are translated into the time labels found at those
        positions, and the dataset is then selected by a label slice over the
        time coordinate.

        Parameters:
        -----------
        subset (xarray.Dataset):
            The xarray Dataset to be sliced by sample number.

        Returns:
        --------
        xarray.Dataset:
            The subset of the dataset that matches the specified sample
            number criteria. Returned unchanged when no sample bound is set.

        Raises:
        -------
        ValueError:
            If the dataset does not contain the time coordinate. This is
            checked even when no sample bound is set.
        IndexError:
            If a sample index lies outside the time coordinate.
        """
        if ctdparams.TIME not in subset.coords:
            raise ValueError(
                f"Dataset does not contain '{ctdparams.TIME}' "
                "coordinate for slicing by sample number"
            )

        if self.min_sample is None and self.max_sample is None:
            return subset

        time_values = subset[ctdparams.TIME].values

        # An open bound is expressed as None inside the slice.
        start = None if self.min_sample is None else time_values[self.min_sample]
        stop = None if self.max_sample is None else time_values[self.max_sample]

        return subset.sel({ctdparams.TIME: slice(start, stop)})

    def __slice_by_time(self, subset: xarray.Dataset) -> xarray.Dataset:
        """
        Slices the dataset by time.

        Parameters:
        -----------
        subset (xarray.Dataset):
            The xarray Dataset to be sliced by time.

        Returns:
        --------
        xarray.Dataset:
            The subset of the dataset that matches the specified time
            criteria. Returned unchanged when no time bound is set.

        Raises:
        -------
        ValueError:
            If the dataset does not contain the time coordinate. This is
            checked even when no time bound is set.
        """
        if ctdparams.TIME not in subset.coords:
            raise ValueError(
                f"Dataset does not contain '{ctdparams.TIME}' "
                "coordinate for slicing by time"
            )

        # Explicit None checks: "is a bound set?" must not depend on the
        # truthiness of the stored value.
        if self.min_datetime is not None or self.max_datetime is not None:
            time_range = slice(self.min_datetime, self.max_datetime)
            subset = subset.sel({ctdparams.TIME: time_range})

        return subset

    def __slice_by_parameter_value(self, subset: xarray.Dataset) -> xarray.Dataset:
        """
        Slices the dataset by parameter values.

        Keeps only the entries whose value for the configured parameter lies
        within the configured minimum and/or maximum (both inclusive).

        Parameters:
        -----------
        subset (xarray.Dataset):
            The xarray Dataset to be sliced by parameter values.

        Returns:
        --------
        xarray.Dataset:
            The subset of the dataset that matches the specified parameter
            value criteria. Returned unchanged when no parameter name is set.

        Raises:
        -------
        ValueError:
            If the configured parameter is not available in the dataset.
        """
        if not self.parameter_name:
            return subset

        if self.parameter_name not in subset:
            raise ValueError(f"Parameter '{self.parameter_name}' not available")

        # Compare against None, not truthiness: a bound of 0 is a valid
        # threshold and must not be skipped.
        if self.parameter_value_min is not None:
            subset = subset.where(
                subset[self.parameter_name] >= self.parameter_value_min,
                drop=True,
            )

        if self.parameter_value_max is not None:
            subset = subset.where(
                subset[self.parameter_name] <= self.parameter_value_max,
                drop=True,
            )

        return subset

    def get_subset(self) -> xarray.Dataset:
        """
        Returns the subset of the dataset based on the specified slicing
        parameters.

        The filters are applied in a fixed order: sample number first, then
        time, then parameter values. Sample indices therefore refer to
        positions in the original, unfiltered dataset.

        Returns:
        --------
        xarray.Dataset:
            The subset of the dataset that matches the specified slicing
            parameters.

        Raises:
        -------
        TypeError:
            If the stored data is not a xarray.Dataset.
        ValueError:
            If the dataset has no time coordinate, or the configured
            parameter is not available in the dataset.
        """
        # self.data is a public attribute and may have been reassigned since
        # construction, so it is re-validated here.
        if not isinstance(self.data, xarray.Dataset):
            raise TypeError("Data must be an xarray.Dataset")

        subset = self.data
        subset = self.__slice_by_sample_number(subset)
        subset = self.__slice_by_time(subset)
        subset = self.__slice_by_parameter_value(subset)

        return subset