# Source code for AID_BC.dataset

# Copyright 2026 IPSL / CNRS / Sorbonne University
# Authors: Kishanthan Kingston
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-ShareAlike 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-sa/4.0/

import numpy as np
import xarray as xr


class ClimateDataset:
    """
    Climate dataset preprocessing pipeline.

    This class:

    - loads the ERA5 and CMIP6 datasets,
    - harmonizes coordinate names,
    - adds a cyclic longitude point,
    - interpolates CMIP6 onto the ERA5 grid, and
    - exposes aligned DataArrays for downstream processing.

    Attributes
    ----------
    era5 : xarray.Dataset or None
        ERA5 dataset as opened by ``load()``; ``None`` until then.
    cmip6 : xarray.Dataset or None
        CMIP6 dataset as opened by ``load()`` (coordinates possibly
        renamed afterwards); ``None`` until then.
    era5_data : xarray.DataArray or None
        Reference ERA5 variable.
    cmip6_data : xarray.DataArray or None
        Interpolated CMIP6 variable on ERA5 grid.
    """

    def __init__(self, era5_path, cmip6_path, variable_name, logger=None):
        """
        Initialize the climate dataset handler.

        No file is opened here; call ``load()`` or ``prepare()`` for that.

        Parameters
        ----------
        era5_path : str
            Path to ERA5 NetCDF file.
        cmip6_path : str
            Path to CMIP6 NetCDF file.
        variable_name : str
            Name of the climate variable to process.
        logger : Logger, optional
            Custom logger instance. It must provide ``info``; ``success``
            is used when available and replaced by ``info`` otherwise.
        """
        self.era5_path = era5_path
        self.cmip6_path = cmip6_path
        self.variable_name = variable_name
        self.logger = logger

        # Populated by load().
        self.era5 = None
        self.cmip6 = None

        # Populated by interpolate_cmip6().
        self.era5_data = None
        self.cmip6_data = None

    def _log(self, level, message):
        """Send ``message`` to the logger method ``level``, if a logger is set.

        Falls back to ``logger.info`` when the logger has no method named
        ``level`` (e.g. ``success`` on a standard-library logger).
        """
        if not self.logger:
            return
        emit = getattr(self.logger, level, None)
        if emit is None:
            emit = self.logger.info
        emit(message)

    def _require_loaded(self, *attribute_names):
        """Raise ``RuntimeError`` if any of the named attributes is still None."""
        missing = [
            name for name in attribute_names if getattr(self, name) is None
        ]
        if missing:
            raise RuntimeError(
                f"Dataset(s) not loaded: {', '.join(missing)}; "
                "call load() first"
            )

    def load(self):
        """
        Load ERA5 and CMIP6 datasets.

        The datasets are only stored on the instance once both files are
        open and both contain ``variable_name``. On any failure the files
        opened so far are closed again before the error propagates.

        Raises
        ------
        ValueError
            If ``variable_name`` is missing from the CMIP6 or the ERA5
            dataset (checked in that order).
        """
        self._log("info", "Loading ERA5 dataset")
        era5 = xr.open_dataset(self.era5_path)

        try:
            self._log("info", "Loading CMIP6 dataset")
            cmip6 = xr.open_dataset(self.cmip6_path)
        except Exception:
            # Do not leak the ERA5 handle when the CMIP6 file cannot be opened.
            era5.close()
            raise

        try:
            # Check variable existence
            for label, dataset in (("CMIP6", cmip6), ("ERA5", era5)):
                if self.variable_name not in dataset:
                    raise ValueError(
                        f"Variable '{self.variable_name}' "
                        f"not found in {label} dataset"
                    )
        except Exception:
            cmip6.close()
            era5.close()
            raise

        self.era5 = era5
        self.cmip6 = cmip6
        self._log("success", "Datasets loaded successfully")

    def rename_cmip6_coordinates(self):
        """
        Rename CMIP6 coordinates to match ERA5 convention.

        ``lat`` becomes ``latitude`` and ``lon`` becomes ``longitude``.
        Coordinates that are absent are left alone, so the call is a
        no-op when neither short name is present.

        Raises
        ------
        RuntimeError
            If the CMIP6 dataset has not been loaded yet.
        """
        self._require_loaded("cmip6")

        short_to_long = {"lat": "latitude", "lon": "longitude"}
        rename_dict = {
            short: long_name
            for short, long_name in short_to_long.items()
            if short in self.cmip6.coords
        }

        if rename_dict:
            self._log("info", f"Renaming CMIP6 coordinates: {rename_dict}")
            # Rename CMIP6 coordinates to match ERA5 coordinate names
            self.cmip6 = self.cmip6.rename(rename_dict)

    @staticmethod
    def make_longitude_cyclic(da):
        """
        Add a cyclic longitude point to avoid interpolation artifacts at
        the longitude seam.

        Parameters
        ----------
        da : xarray.DataArray
            Input data array with a ``longitude`` coordinate.

        Returns
        -------
        da_ext : xarray.DataArray
            Data array extended with one additional cyclic longitude point.
        """
        # Append the first longitude slice to the end of the data array.
        # This closes the longitude cycle and helps avoid edge effects.
        # NOTE(review): only the gap after the last longitude is closed;
        # targets below the first source longitude are not covered.
        da_ext = xr.concat([da, da.isel(longitude=0)], dim="longitude")

        # The appended slice sits one full turn (360 degrees) after the
        # first longitude value.
        longitudes = da["longitude"].values
        new_lon = np.append(longitudes, longitudes[0] + 360)

        # Assign the updated longitude coordinate to the extended data array
        return da_ext.assign_coords(longitude=new_lon)

    def interpolate_cmip6(self):
        """
        Interpolate CMIP6 variable onto ERA5 grid.

        Stores the interpolated CMIP6 variable in ``cmip6_data`` and the
        untouched ERA5 variable in ``era5_data``. The CMIP6 dataset is
        expected to already use ``latitude``/``longitude`` coordinate
        names (see ``rename_cmip6_coordinates``).

        Raises
        ------
        RuntimeError
            If the datasets have not been loaded yet.
        """
        self._require_loaded("era5", "cmip6")

        self._log("info", "Adding cyclic longitude to CMIP6 data")
        # Add a cyclic longitude point to the CMIP6 variable before
        # interpolation
        cmip6_fixed = self.make_longitude_cyclic(
            self.cmip6[self.variable_name]
        )

        self._log("info", "Interpolating CMIP6 onto ERA5 grid")
        # Interpolate CMIP6 data onto the latitude and longitude grid of ERA5
        self.cmip6_data = cmip6_fixed.interp(
            latitude=self.era5["latitude"],
            longitude=self.era5["longitude"],
            method="linear",
        )

        # Store the ERA5 reference variable without interpolation
        self.era5_data = self.era5[self.variable_name]

        self._log(
            "success", f"Interpolation completed: {self.cmip6_data.shape}"
        )

    def prepare(self):
        """
        Run the complete preprocessing pipeline.

        Steps, in order: ``load()``, ``rename_cmip6_coordinates()``,
        ``interpolate_cmip6()``.
        """
        # Load ERA5 and CMIP6 datasets
        self.load()

        # Rename CMIP6 coordinates to match ERA5 coordinate names
        self.rename_cmip6_coordinates()

        # Interpolate CMIP6 data onto the ERA5 grid
        self.interpolate_cmip6()

        self._log("success", "Dataset preparation completed")