Source code for pyMAISE.datasets._handler

from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

from pyMAISE.preprocessing import read_csv


def _get_full_path(path: str):
    """Return ``path`` joined onto the directory of this module, as a string.

    The data files shipped with pyMAISE are addressed relative to the
    directory that contains this source file.
    """
    module_dir = Path(__file__).parent
    return str(module_dir.joinpath(path))


def load_MITR():
    """
    Load the MIT reactor data set.

    The six inputs are the control blade heights :math:`[cm]` (labeled as
    ``CB#``) and the 22 outputs are the fuel element powers :math:`[W]`
    (labeled as ``A-#``, ``B-#``, or ``C-#``). This data comes from
    :cite:`RADAIDEH2023112423` and was constructed by perturbing the control
    blade heights and simulating the reactor in MCNP to determine the
    expected power in each element. The data set includes 1000 samples.

    Returns
    -------
    data: xarray.DataArray
        Raw MIT reactor data.
    inputs: xarray.DataArray
        Control blade heights.
    outputs: xarray.DataArray
        Fuel element power.
    """
    # The data set is split across two bundled CSV files that are read together.
    crx_csv = _get_full_path("crx.csv")
    powery_csv = _get_full_path("powery.csv")
    return read_csv([crx_csv, powery_csv])
def _read_chf_csv(path):
    """Read one CHF CSV file with columns 0-5 as inputs and column 6 as output."""
    return read_csv(path, input_slice=slice(0, 6), output_slice=slice(6, 7))


def load_chf(data_path=None):
    """
    Load Critical Heat Flux (CHF) synthetic data.

    This dataset is based on that produced by the NEA in
    :cite:`CHF_Benchmark`. The NEA dataset was generated from vertical
    water-cooled uniformly heated tubes, producing 24576 samples from 59
    different sources of measurement. The dataset was collected over
    experimental measurements spanning 60 years of CHF data collection
    methods such as visual identification, physical burnout, changes in the
    test section resistances, and the usage of thermocouples. The input
    parameters collected include:

    - ``D (m)``: Diameter of the test section (:math:`0.002 - 0.016~m`),
    - ``L (m)``: Heated length (:math:`0.07 - 15.0~m`),
    - ``P (kPa)``: Pressure (:math:`100-20000~kPa`),
    - ``G (kg m-2s-1)``: Mass flux
      (:math:`17.7-7712.0~\\frac{kg}{m^2\\cdot s}`),
    - ``Tin (C)``: Inlet temperature (:math:`9.0-353.62^\\circ C`),
    - ``Xe (-)``: Outlet equilibrium quality (:math:`-0.445-0.986`),

    with output

    - ``CHF (kW m-2)``: Critical heat flux
      (:math:`130.0-13345.0~\\frac{kW}{m^2}`).

    Negative equilibrium quality (:math:`X`) could represent a subcooled
    fluid. The database was limited to a diameter between
    (:math:`2 < D < 25~mm`), :math:`L/D` ratio (:math:`L/D > 50` for
    :math:`X > 0`, :math:`L/D > 25` for :math:`X < 0`), pressure
    (:math:`100 \\le P \\le 21000 kPa`), and mass flux
    (:math:`0 \\le G < 8000~\\frac{kg}{m^2\\cdot s}`). The measured data was
    also not equally distributed over the whole span, and no data beyond
    :math:`D=16~mm` was found in the database.

    As the original dataset is confidential, we do not open-source it here
    on pyMAISE; therefore, what is given here is a synthetic version designed
    to mimic the behavior of the real data. It consists of 2500 samples, 2000
    training samples and 500 testing samples. These were generated by adding
    random noise to the experimental data.

    Parameters
    ----------
    data_path: None, str, or list of two str(s), default=None
        Path(s) to real NEA data. A single string is read as one data set; a
        list (or tuple) of two strings is read as ``[train_path, test_path]``.
        If ``None`` (or any other falsy value) the bundled synthetic training
        and testing files are loaded.

    If ``data_path`` is ``None`` or a list of two strings:

    Returns
    -------
    train_data: xarray.DataArray
        Synthetic CHF training data.
    xtrain: xarray.DataArray
        Training inputs.
    ytrain: xarray.DataArray
        Training output.
    test_data: xarray.DataArray
        Synthetic CHF testing data.
    xtest: xarray.DataArray
        Testing inputs.
    ytest: xarray.DataArray
        Testing output.

    If ``data_path`` is a string:

    Returns
    -------
    data: xarray.DataArray
        Raw CHF data.
    inputs: xarray.DataArray
        Input data.
    outputs: xarray.DataArray
        Output data.

    Raises
    ------
    RuntimeError
        If ``data_path`` is truthy but is neither a string nor a sequence of
        exactly two strings.
    """
    # A falsy data_path selects the synthetic data shipped with the package,
    # which then goes through the same two-file path as user-supplied files.
    if not data_path:
        data_path = [
            _get_full_path("chf_train_synth.csv"),
            _get_full_path("chf_test_synth.csv"),
        ]

    if isinstance(data_path, str):
        return _read_chf_csv(data_path)

    if (
        isinstance(data_path, (list, tuple))
        and len(data_path) == 2
        and all(isinstance(entry, str) for entry in data_path)
    ):
        train_data, xtrain, ytrain = _read_chf_csv(data_path[0])
        test_data, xtest, ytest = _read_chf_csv(data_path[1])
        return train_data, xtrain, ytrain, test_data, xtest, ytest

    raise RuntimeError(
        "Invalid data_path, must be a string or a list of two strings"
    )
def load_xs():
    """
    Load the reactor physics data set.

    There are 1000 samples with eight cross sections (XS)
    :math:`[cm^{-1}]` as inputs:

    - ``FissionFast``: fast fission,
    - ``CaptureFast``: fast capture,
    - ``FissionThermal``: thermal fission,
    - ``CaptureThermal``: thermal capture,
    - ``Scatter12``: group 1 to 2 scattering,
    - ``Scatter11``: group 1 to 1 scattering,
    - ``Scatter21``: group 2 to 1 scattering,
    - ``Scatter22``: group 2 to 2 scattering,

    with output of :math:`k`, the neutron multiplication factor. This data
    was taken from :cite:`RADAIDEH2019264`, a sensitivity analysis using the
    Shapley effect. The geometry of the problem is a pressurized water
    reactor (PWR) lattice based on the BEAVRS benchmark. The lattice utilizes
    quarter core symmetry in TRITON and is depleted to :math:`50~GWD/MTU`.
    The data was constructed using a two-step process:

    1. the uncertainty in the fundamental microscopic XS data was propagated,
    2. and these XSs were collapsed into a 2-group form using

    .. math::
        \\Sigma_x^g = \\frac{\\int_{\\Delta E_g}dE\\int_V\\Sigma_{x, m}(E)
        \\phi(r, E, t)dV}{\\int_{\\Delta E_g}dE\\int_V\\phi(r, E, t)dV}.

    The Sampler module in SCALE was used for uncertainty propagation, and the
    56-group XS and covariance libraries were used in TRITON to create
    56-group homogeneous XSs using the above equation. The homogeneous XSs
    were then collapsed into a 2-group library. One thousand random samples
    were taken from the Sampler.

    Returns
    -------
    data: xarray.DataArray
        Raw reactor physics data.
    inputs: xarray.DataArray
        Cross sections.
    outputs: xarray.DataArray
        :math:`k`, neutron multiplication factor, data.
    """
    # Every column but the last, followed by the last column on its own.
    input_columns = slice(0, -1)
    output_columns = slice(-1, None)
    csv_file = _get_full_path("xs.csv")
    return read_csv(csv_file, input_columns, output_columns)
def load_fp():
    """
    Load the fuel performance data set.

    This data set consists of 13 inputs:

    - ``fuel_dens``: fuel density :math:`[kg/m^3]`,
    - ``porosity``: porosity,
    - ``clad_thick``: cladding thickness :math:`[m]`,
    - ``pellet_OD``: pellet outer diameter :math:`[m]`,
    - ``pellet_h``: pellet height :math:`[m]`,
    - ``gap_thickness``: gap thickness :math:`[m]`,
    - ``inlet_T``: inlet temperature :math:`[K]`,
    - ``enrich``: U-235 enrichment,
    - ``rough_fuel``: fuel roughness :math:`[m]`,
    - ``rough_clad``: cladding roughness :math:`[m]`,
    - ``ax_pow``: axial power,
    - ``clad_T``: cladding surface temperature :math:`[K]`,
    - ``pressure``: pressure :math:`[Pa]`,

    and four outputs:

    - ``fis_gas_produced``: fission gas production :math:`[mol]`,
    - ``max_fuel_centerline_temp``: max fuel centerline temperature
      :math:`[K]`,
    - ``max_fuel_surface_temperature``: max fuel surface temperature
      :math:`[K]`,
    - ``radial_clad_dia``: radial cladding diameter displacement after
      irradiation :math:`[m]`,

    with 400 data points. This data is case 1 from
    :cite:`RADAIDEH2020106731`, which is based on the pellet-cladding
    mechanical interaction (PCMI) benchmark. The 13 inputs were uniformly
    randomly sampled independently within their uncertainty bounds and
    simulated in BISON. The rod response was recorded in four outputs.

    Returns
    -------
    data: xarray.DataArray
        Raw fuel performance data.
    inputs: xarray.DataArray
        13 inputs.
    outputs: xarray.DataArray
        4 outputs.
    """
    # Inputs and outputs are stored in separate bundled CSV files.
    csv_files = [_get_full_path(name) for name in ("fp_inp.csv", "fp_out.csv")]
    return read_csv(csv_files)
def load_heat():
    """
    Load the heat conduction data set.

    This data consists of 1000 samples of 7 inputs:

    - ``qprime``: linear heat generation rate :math:`[W/m]`,
    - ``mdot``: mass flow rate :math:`[g/s]`,
    - ``Tin``: temperature of the fuel boundary :math:`[K]`,
    - ``R``: fuel radius :math:`[m]`,
    - ``L``: fuel length :math:`[m]`,
    - ``Cp``: heat capacity :math:`[J/(g\\cdot K)]`,
    - ``k``: thermal conductivity :math:`[W/(m\\cdot K)]`,

    with one output:

    - ``T``: fuel centerline temperature :math:`[K]`.

    The data set was constructed through Latin hypercube sampling of the
    seven input parameters for heat conduction through a fuel rod. These
    samples were then used to solve for the fuel centerline temperature
    analytically. We assume volumetric heat generation is uniform radially.
    The problem is defined by

    .. math::
        \\frac{1}{r}\\frac{d}{dr}\\Big(kr\\frac{dT}{dr}\\Big) + q''' = 0

    with two boundary conditions: :math:`\\frac{dT}{dr}\\Big|_{r=0}=0` and
    :math:`T(R) = T_{in}`. Therefore, the temperature profile in the fuel is

    .. math::
        T(r) = \\frac{q'}{4\\pi k}(1 - (r/R)^2) + T_{in}.

    Returns
    -------
    data: xarray.DataArray
        Raw heat conduction data.
    inputs: xarray.DataArray
        Seven inputs.
    outputs: xarray.DataArray
        Fuel centerline temperature.
    """
    # Every column but the last, followed by the last column on its own.
    input_columns = slice(0, -1)
    output_columns = slice(-1, None)
    csv_file = _get_full_path("heat.csv")
    return read_csv(csv_file, input_columns, output_columns)
def load_rea():
    """
    Load the NEACRP C1 rod ejection accident (REA) data set.

    This data consists of 2000 samples of four inputs:

    - ``rod_worth``: reactivity worth of the ejected rod,
    - ``beta``: delayed neutron fraction,
    - ``h_gap``: gap conductance :math:`[W/(m^2\\cdot K)]`,
    - ``gamma_frac``: direct heating fraction,

    with four outputs:

    - ``max_power``: peak power reached during transient :math:`[\\%FP]`,
    - ``burst_width``: width of power burst :math:`[s]`,
    - ``max_TF``: max fuel centerline temperature :math:`[K]`,
    - ``avg_Tcool``: average coolant outlet temperature :math:`[K]`.

    This data set was constructed by perturbing the inputs listed above
    before the REA transient was simulated in PARCS.

    Returns
    -------
    data: xarray.DataArray
        Raw rod ejection data.
    inputs: xarray.DataArray
        Four inputs.
    outputs: xarray.DataArray
        Four outputs.
    """
    # Inputs and outputs are stored in separate bundled CSV files.
    inputs_csv = _get_full_path("rea_inputs.csv")
    outputs_csv = _get_full_path("rea_outputs.csv")
    return read_csv([inputs_csv, outputs_csv])
def load_BWR():
    """
    Load the BWR Micro Core data set.

    This data consists of 2000 samples of 9 inputs:

    - ``PSZ``: Fuel bundle region Power Shaping Zone (PSZ),
    - ``DOM``: Fuel bundle region Dominant zone (DOM),
    - ``vanA``: Fuel bundle region vanishing zone A (VANA),
    - ``vanB``: Fuel bundle region vanishing zone B (VANB),
    - ``subcool``: Represents moderator inlet conditions. Core inlet
      subcooling is interpreted to be at the steam dome pressure (i.e., not
      core-averaged pressure). The input value for subcooling will
      automatically be increased to account for this fact. (Btu/lb),
    - ``CRD``: Defines the position of all control rod groups (banks),
    - ``flow_rate``: Defines essential global design data for rated coolant
      mass flux for the active core, :math:`\\frac{kg}{(cm^{2}-hr)}`. Coolant
      mass flux equals active core flow divided by core cross-section area.
      The core cross-section area is DXA 2 times the number of assemblies,
    - ``power_density``: Defines essential global design data for rated power
      density using cold dimensions, :math:`(\\frac{kw}{liter})`,
    - ``VFNGAP``: Defines the ratio of narrow water gap width to the sum of
      the narrow and wide water gap widths,

    with five outputs:

    - ``K-eff``: Reactivity coefficient k-effective, the effective neutron
      multiplication factor,
    - ``Max3Pin``: Maximum planar-averaged pin power peaking factor,
    - ``Max4Pin``: Maximum pin-power peaking factor, :math:`F_{q}`, (which
      includes axial intranodal peaking),
    - ``F-delta-H``: Ratio of max-to-average enthalpy rise in a channel,
    - ``Max-Fxy``: Maximum radial pin-power peaking factor.

    This data set was constructed through uniform and normal sampling of the
    9 input parameters for a boiling water reactor (BWR) micro-core. These
    samples were then used to solve for reactor characteristic changes in
    heat distribution and neutron flux. This BWR micro-core consists of 4
    radially and axially heterogeneous assemblies of the same type
    constructed in a 2x2 grid with a control-blade placed in the center. A
    single assembly was broken into seven zones where each zone's 2D radial
    cross-sectional information was constructed using CASMO-4. These cross
    sectional libraries were then processed through CMSLINK for SIMULATE-3 to
    interpret. The core geometry and physics were implemented and modeled
    using SIMULATE-3.

    Returns
    -------
    data: xarray.DataArray
        Raw BWR Micro Reactor data.
    inputs: xarray.DataArray
        9 inputs.
    outputs: xarray.DataArray
        5 outputs.
    """
    # Inputs and outputs are stored in separate bundled CSV files.
    inputs_csv = _get_full_path("bwr_input.csv")
    outputs_csv = _get_full_path("bwr_output.csv")
    return read_csv([inputs_csv, outputs_csv])
def load_HTGR():
    """
    Load the HTGR Micro Reactor data set.

    This data consists of 751 samples of 8 inputs:

    - ``theta_{1}``: Angle of control drum in quadrant 1 (degrees),
    - ``theta_{2}``: Angle of control drum in quadrant 1 (degrees),
    - ``theta_{3}``: Angle of control drum in quadrant 2 (degrees),
    - ``theta_{4}``: Angle of control drum in quadrant 2 (degrees),
    - ``theta_{5}``: Angle of control drum in quadrant 3 (degrees),
    - ``theta_{6}``: Angle of control drum in quadrant 3 (degrees),
    - ``theta_{7}``: Angle of control drum in quadrant 4 (degrees),
    - ``theta_{8}``: Angle of control drum in quadrant 4 (degrees),

    with 4 outputs:

    - ``FluxQ1``: Neutron flux in quadrant 1
      :math:`(\\frac{neutrons}{cm^{2} s})`,
    - ``FluxQ2``: Neutron flux in quadrant 2
      :math:`(\\frac{neutrons}{cm^{2} s})`,
    - ``FluxQ3``: Neutron flux in quadrant 3
      :math:`(\\frac{neutrons}{cm^{2} s})`,
    - ``FluxQ4``: Neutron flux in quadrant 4
      :math:`(\\frac{neutrons}{cm^{2} s})`.

    This data set is based on the HOLOS-Quad reactor design. This reactor
    implements modular construction where separate units can be transported
    independently and assembled at the plant. The HOLOS-Quad core is a 22 MWt
    high-temperature gas-cooled microreactor (HTGR) controlled by 8
    cylindrical control drums. It utilizes TRISO fuel particles contained in
    hexagonal graphite blocks as a moderator. These graphite blocks have
    channels where helium gas can pass through for cooling. The main
    importance of this data set is the influence of the control drum position
    on the neutron flux distribution. The drums control reactivity by
    rotating to vary the proximity of :math:`B_{4}C`, located on a portion of
    the cylindrical surface, to the fuel. Perturbations of the control drums
    cause the core power shape to shift, leading to complex power
    distributions. Therefore, predictions of control drum reactivity worth
    for arbitrary configurations make this problem nontrivial. The data was
    taken from :cite:`PRICE2022111776` and does not utilize symmetry
    preprocessing to expand the data set.

    Returns
    -------
    data: xarray.DataArray
        Raw HTGR Micro Reactor data with no symmetry conditions applied.
    inputs: xarray.DataArray
        Eight inputs.
    outputs: xarray.DataArray
        Four outputs.
    """
    # Column positions 29-36 (eight columns) and 4-7 (four columns) of the
    # bundled CSV file, matching the eight inputs and four outputs above.
    input_columns = slice(29, 37)
    output_columns = slice(4, 8)
    csv_file = _get_full_path("microreactor.csv")
    return read_csv(csv_file, input_columns, output_columns)
def load_loca(stack_series=False):
    """
    Load the loss of coolant accident (LOCA) time series data set.

    This data comes from :cite:`RADAIDEH2020113699` and consists of 40
    time-independent features that describe the initial state of the reactor
    during the LOCA transient, which are propagated out in time. There are
    four sequences:

    - ``Pellet Cladding Temperature``: pellet cladding temperature [K],
    - ``Core Pressure``: core pressure [Pa],
    - ``Water Level``: water level [m],
    - ``Break Flow Rate``: break flow rate [kg/s],

    with 400 time steps. The original data was randomly sampled for 2000
    perturbed data points and the nominal sample, 2001 samples total.

    Parameters
    ----------
    stack_series: bool, default=False
        If false, the data is loaded in 3D with dimensions (samples, time
        steps, features). If true, the data is loaded in 2D with the
        sequences stacked like pulse data resulting in dimensions (time
        steps, features).

    Returns
    -------
    nominal_data: xarray.DataArray
        The 2D or 3D nominal LOCA data. If 2D it will be shaped (400, 44) and
        if 3D then the shape is (1, 400, 44).
    perturbed_data: xarray.DataArray
        The 2D or 3D perturbed LOCA data. If 2D it is shape (800000, 44) and
        if 3D then the shape is (2000, 400, 44).
    """
    # Sequence name -> bundled CSV file. The key order fixes the order in
    # which the sequences are appended to the feature axis.
    sequence_files = {
        "Pellet Cladding Temperature": "loca_pct.csv",
        "Core Pressure": "loca_core_pressure.csv",
        "Water Level": "loca_water_level.csv",
        "Break Flow Rate": "loca_break_flow_rate.csv",
    }

    # Each sequence file is read without a header, transposed, and given a
    # trailing axis so the sequences can be joined along the last axis.
    sequence_arrays = [
        pd.read_csv(_get_full_path(file_name), header=None).values.T[
            :, :, np.newaxis
        ]
        for file_name in sequence_files.values()
    ]
    outputs = np.concatenate(sequence_arrays, axis=-1)

    # The time-independent inputs are repeated once per time step of the
    # output sequences.
    raw_inputs = pd.read_csv(_get_full_path("loca_inp.csv"))
    inputs = np.repeat(
        raw_inputs.values[:, np.newaxis, :], outputs.shape[1], axis=1
    )

    all_data = np.concatenate((inputs, outputs), axis=-1)
    n_samples, n_steps, n_features = all_data.shape
    feature_names = list(raw_inputs.columns) + list(sequence_files.keys())

    # The first sample is the nominal case; the rest are perturbed.
    nominal = all_data[[0],]
    perturbed = all_data[1:,]

    if stack_series:
        # Fold the samples axis into the time steps axis -> 2D data.
        nominal_data = xr.DataArray(
            nominal.reshape((-1, n_features)),
            coords={
                "time steps": np.arange(n_steps),
                "features": feature_names,
            },
        )
        perturbed_data = xr.DataArray(
            perturbed.reshape((-1, n_features)),
            coords={
                "time steps": np.arange((n_samples - 1) * n_steps),
                "features": feature_names,
            },
        )
        return nominal_data, perturbed_data

    nominal_data = xr.DataArray(
        nominal,
        coords={
            "samples": [0],
            "time steps": np.arange(n_steps),
            "features": feature_names,
        },
    )
    perturbed_data = xr.DataArray(
        perturbed,
        coords={
            "samples": np.arange(n_samples - 1),
            "time steps": np.arange(n_steps),
            "features": feature_names,
        },
    )
    return nominal_data, perturbed_data
def _ensure_min_str_width(labels, min_chars):
    """Return ``labels`` with a unicode dtype at least ``min_chars`` wide."""
    char_size = np.dtype("U1").itemsize
    if labels.dtype.itemsize // char_size >= min_chars:
        return labels
    return labels.astype(f"U{min_chars}")


def load_anomaly(
    input_path,
    output_path,
    stack_series=False,
    multiclass=False,
    propagate_output=False,
    non_faulty_frac=1.0,
    timestep_step=1,
):
    """
    Load time series electronic signal data from
    `Mendeley <https://data.mendeley.com/datasets/kbbrw99vh8/5>`_ provided by
    :cite:`RADAIDEH2022103704,radaideh2023early`.

    This dataset derives from the measurement of 14 parameters of the high
    voltage converter modulators (HVCMs) used at the Spallation Neutron
    Source facility. Each of these waveforms was classified as "fault" or
    "run" depending on the failure of the HVCM during operation. The 14
    waveform inputs are:

    - ``A+IGBT-I: current``: Current passing through the IGBT switch of
      phase A+ in Qa1 (:math:`A`)
    - ``A+*IGBT-I: current``: Current passing through the IGBT switch of
      phase A+\\* in Qa3 (:math:`A`)
    - ``B+IGBT-I: current``: Current passing through the IGBT switch of
      phase B+ in Qb1 (:math:`A`)
    - ``B+*IGBT-I: current``: Current passing through the IGBT switch of
      phase B+\\* in Qb3 (:math:`A`)
    - ``C+IGBT-I: current``: Current passing through the IGBT switch of
      phase C+ in Qc1 (:math:`A`)
    - ``C+*IGBT-I: current``: Current passing through the IGBT switch of
      phase C+\\* in Qc3 (:math:`A`)
    - ``A-Flux``: Magnetic flux density for phase A in transformer XA
      (:math:`-`)
    - ``B-Flux``: Magnetic flux density for phase B in transformer XB
      (:math:`-`)
    - ``C-Flux``: Magnetic flux density for phase C in transformer XC
      (:math:`-`)
    - ``Mod-V``: Modulator voltage (:math:`V`)
    - ``Mod-I``: Modulator current (:math:`A`)
    - ``CB-I``: Cap bank current (:math:`-`)
    - ``CB-V``: Cap bank voltage (:math:`V`)
    - ``DV/DT``: Time derivative change of the Mod\\-V voltage (:math:`-`)

    There is one output for this dataset:

    - ``Class_Run``/``Class_Fault``: Whether a waveform is a part of a system
      fault

    .. note:: The outputs returned by this are not one-hot encoded. It is a
       single label with class "Run" and "Fault".

    Parameters
    ----------
    input_path: str
        Path to input file. Raw data can be found at
        `Mendeley <https://data.mendeley.com/datasets/kbbrw99vh8/5>`_
    output_path: str
        Path to output file. Raw data can be found at
        `Mendeley <https://data.mendeley.com/datasets/kbbrw99vh8/5>`_
    stack_series: bool, default=False
        If true, then the samples and time steps dimensions are combined.
        ``propagate_output`` must be true for ``stack_series`` to be true.
    multiclass: bool, default=False
        If true, then the multiclass case is returned with 8 possible
        classifications: ``Normal``, ``IGBT Fault``, ``CB or TPS Fault``,
        ``Driver Fault``, ``Flux Fault``, ``DV/DT Fault``,
        ``SCR AC Input Fault``, or ``Misc/Unknown``. If this is false then
        the binary class is returned (``Run`` or ``Fault``).
    propagate_output: bool, default=False
        If true, the label of each sample is repeated for every retained
        time step so the outputs have dimensions (samples, time steps,
        features) instead of (samples, features).
    non_faulty_frac: float, default=1.0
        The fraction of non-faulty data to include. The retained non-faulty
        samples are drawn at random with ``numpy.random.choice``.
    timestep_step: int, default=1
        Time steps are taken every other ``timestep_step``. When
        ``timestep_step == 1`` all timesteps are given.

    Returns
    -------
    inputs: xarray.DataArray
        14 inputs.
    outputs: xarray.DataArray
        1 output.

    Raises
    ------
    RuntimeError
        If ``stack_series`` is true while ``propagate_output`` is not.
    """
    # Reject the unsupported option combination before any file is read.
    # Using truthiness (not ``is False``) also catches falsy non-bool values,
    # which would otherwise yield inputs and outputs of different lengths.
    if stack_series and not propagate_output:
        raise RuntimeError("propagate_output must be True if stack_series is True")

    # Load the data, keeping every ``timestep_step``-th time step.
    X = np.load(input_path)[:, ::timestep_step, :]
    # NOTE(security): allow_pickle=True can execute arbitrary code while
    # unpickling; only load label files from a trusted source.
    # The first label column is dropped; the remainder is handled as text.
    Y = np.load(output_path, allow_pickle=True).astype(str)[:, 1:]

    # Keep every faulty pulse plus a random fraction of the non-faulty ones,
    # preserving the original sample order.
    run_idxs = np.flatnonzero(Y[:, 0] == "Run")
    faulty_idxs = np.flatnonzero(Y[:, 0] == "Fault")
    frac_run_idxs = np.random.choice(
        run_idxs, size=int(run_idxs.size * non_faulty_frac), replace=False
    )
    idxs = np.sort(np.concatenate((faulty_idxs, frac_run_idxs), axis=0))
    X = X[idxs,]
    Y = Y[idxs,]

    if multiclass:
        # Class label -> substrings that identify it in the last label column.
        multiclass_labels = {
            "IGBT Fault": ["IGBT"],
            "CB or TPS Fault": ["CB", "CapBank", "TPS"],
            "Driver Fault": ["Driver"],
            "Flux Fault": ["Flux", "FLUX"],
            "DV/DT Fault": ["DV/DT"],
            "SCR AC Input Fault": ["SCR"],
        }
        unknown_label = "Misc/Unknown"

        # Y has a fixed-width unicode dtype, so assigning a longer string
        # would be silently truncated. Widen it to fit every class label.
        longest_label = max(
            len(name) for name in (*multiclass_labels, unknown_label)
        )
        Y = _ensure_min_str_width(Y, longest_label)

        # Rename candidates to class labels. The lookup column is re-read on
        # every pass, so assignment order is kept as in the mapping above.
        for label, options in multiclass_labels.items():
            for option in options:
                Y[np.char.find(Y[:, -1], option) != -1, 1] = label

        # Change all other failures to unknown.
        known = np.isin(Y[:, 1], ["Normal", *multiclass_labels])
        Y[~known, 1] = unknown_label

    # Column 0 holds the binary label, column 1 the multiclass label.
    Y = Y[:, [1 if multiclass else 0]]

    if propagate_output:
        # (samples, 1) -> (samples, time steps, 1)
        Y = np.transpose(
            np.broadcast_to(Y[..., None], Y.shape + (X.shape[1],)), axes=(0, 2, 1)
        )

    input_col_names = [
        "A+IGBT-I: current",
        "A+*IGBT-I: current",
        "B+IGBT-I: current",
        "B+*IGBT-I: current",
        "C+IGBT-I: current",
        "C+*IGBT-I: current",
        "A-Flux",
        "B-Flux",
        "C-Flux",
        "Mod-V",
        "Mod-I",
        "CB-I",
        "CB-V",
        "DV/DT",
    ]

    # If stack_series we combine samples and time steps dimensions.
    if stack_series:
        X = X.reshape((-1, X.shape[-1]))
        Y = Y.reshape((-1, Y.shape[-1]))
        inputs = xr.DataArray(
            X,
            coords={
                "time steps": np.arange(X.shape[0]),
                "features": input_col_names,
            },
        )
        outputs = xr.DataArray(
            Y,
            coords={
                "time steps": np.arange(Y.shape[0]),
                "features": ["Class"],
            },
        )
        return inputs, outputs

    inputs = xr.DataArray(
        X,
        coords={
            "samples": np.arange(X.shape[0]),
            "time steps": np.arange(X.shape[1]),
            "features": input_col_names,
        },
    )
    # Outputs only carry a time steps dimension when labels were propagated.
    output_coords = {"samples": np.arange(Y.shape[0])}
    if propagate_output:
        output_coords["time steps"] = np.arange(Y.shape[1])
    output_coords["features"] = ["Class"]
    outputs = xr.DataArray(Y, coords=output_coords)
    return inputs, outputs