def download_medmnist( dataset:str, # The name of the MedMNIST dataset (e.g., 'pathmnist', 'bloodmnist', etc.). output_dir:str='.', # The path to the directory where the datasets will be saved. download_only:bool=False, # If True, only download the dataset into the output directory without processing. save_images:bool=True, # If True, save the images into the output directory as .png (2D datasets) or multipage .tiff (3D datasets) files.):
Downloads the specified MedMNIST dataset and saves the training, validation, and test datasets into the specified output directory. Images are saved as .png for 2D data and multi-page .tiff for 3D data, organized into folders named after their labels.
Returns: None, saves images in the specified output directory if save_images is True.
def medmnist2df( train_dataset, # MedMNIST training dataset with images and labels val_dataset:NoneType=None, # (Optional) MedMNIST validation dataset with images and labels test_dataset:NoneType=None, # (Optional) MedMNIST test dataset with images and labels mode:str='RGB', # Mode for PIL Image conversion, e.g., 'RGB', 'L')->(pandas.DataFrame, pandas.DataFrame, pandas.DataFrame): # (df_train, df_val, df_test): DataFrames with columns 'image' and 'label'
Convert MedMNIST datasets to DataFrames, with images as PIL Image objects and labels as DataFrame columns.
Missing datasets (if None) are represented by None in the return tuple.
def download_file( url, # The URL of the file to be downloaded output_dir:str='data', # The directory where the downloaded file will be saved extract:bool=True, # If True, decompresses the file if it's in a compressed format hash:NoneType=None, # Optional: You can add a checksum for integrity verification extract_dir:NoneType=None, # Directory to extract the files to):
Download and optionally decompress a single file using Pooch.
def download_files( urls, # A list of URLs to download output_dir:str='data', # The directory or list of directories where the downloaded files will be saved extract:bool=True, # If True, decompresses the files if they are in a compressed format hash_list:NoneType=None, # Optional: A list of checksums for integrity verification corresponding to each URL extract_dir:NoneType=None, # Directory to extract the files to):
Download and optionally decompress multiple files using Pooch.
def download_dataset( base_url, # The base URL from which the files will be downloaded. expected_checksums, # A dictionary mapping file names to their expected checksums. file_names, # A dictionary mapping task identifiers to file names. output_dir, # The directory where the downloaded files will be saved. processor:NoneType=None, # A function to process the downloaded data.):
Download a dataset using Pooch and save it to the specified output directory.
def download_dataset_from_csv( csv_file, # Path to the CSV file containing file names and checksums. base_url, # The base URL from which the files will be downloaded. output_dir, # The directory where the downloaded files will be saved. processor:NoneType=None, # A function to process the downloaded data. rows:NoneType=None, # Specific row indices to download. If None, download all rows. prepend_mdf5:bool=True, # If True, prepend 'md5:' to the checksums.):
Download a dataset using Pooch and save it to the specified output directory, reading file names and checksums from a CSV file.
# Specify the directory where you want to save the downloaded files
output_directory = "./_test_folder"
# Define the base URL for the MSD dataset
base_url = 'https://s3.ap-northeast-1.wasabisys.com/gigadb-datasets/live/pub/10.5524/100001_101000/100888/'
download_dataset_from_csv('./data_examples/FMD_dataset_info.csv', base_url, output_directory, rows=[6])
The dataset has been successfully downloaded and saved to: ./_test_folder
def aics_pipeline( n_images_to_download:int=40, # Number of images to download image_save_dir:NoneType=None, # Directory to save the images col:str='SourceReadPath', # Column name for image paths in the data manifest):
def manifest2csv( signal, # List of paths to signal images target, # List of paths to target images paths:NoneType=None, # List of paths to images train_fraction:float=0.8, # Fraction of data to use for training data_save_path:str='./', # Path to save the CSV files train:str='train.csv', # Name of the training CSV file test:str='test.csv', # Name of the test CSV file identifier:NoneType=None, # Identifier to add to the paths):
def split_dataframe( input_data, # Path to CSV file or DataFrame train_fraction:float=0.7, # Proportion of data to use for the training set valid_fraction:float=0.1, # Proportion of data to use for the validation set split_column:NoneType=None, # Column name that indicates pre-defined split stratify:bool=False, # If True, stratify by split_column during random split add_is_valid:bool=False, # If True, adds 'is_valid' column in the train set to mark validation samples train_path:str='train.csv', # Path to save the training CSV file test_path:str='test.csv', # Path to save the test CSV file valid_path:str='valid.csv', # Path to save the validation CSV file data_save_path:NoneType=None, # Path to save the data files random_seed:NoneType=None, # Random state for reproducibility):
Splits a DataFrame or CSV file into train, test, and optional validation sets.
def add_columns_to_csv( csv_path, # Path to the input CSV file column_data, # Dictionary of column names and values to add. Each value can be a scalar (single value for all rows) or a list matching the number of rows. output_path:NoneType=None, # Path to save the updated CSV file. If None, it overwrites the input CSV file.):
Adds one or more new columns to an existing CSV file.
def build_csv( filenames:Union, # List of file names to process functions:Callable, # One or more functions that take a filename and return a string (e.g., for generating target paths). function_names:Union=None, # Optional column names for the function outputs. If None, function.__name__ is used. output_csv:Union=None, # If provided, saves the full dataframe to this CSV path. split:bool=False, # If True, applies split_dataframe to the generated dataframe. split_kwargs:Optional=None, # Keyword arguments passed to split_dataframe.)->Optional: # Returns the dataframe if split=False. If split=True, returns None (files are saved by split_dataframe).
Create a DataFrame from filenames and one or more transformation functions.