Leave One Out Cross-Validation (tutorial) ========================================= This tutorial shows how to use ScanOMetrics to cross-validate a normative model and dataset, using Leave-One-Out Cross Validation (LOOCV). As an example, we use the OASIS3 dataset, organized following BIDS guidelines. Filtering healthy subjects -------------------------- The following code assumes you are in the 'code' folder from the OASIS bids directory. It loads the OASIS3 subjects information, including their diagnosis-subtype, which we use to filter out healthy subjects for which all scans have a CDR = 0: :: from scanometrics.utils.logging import set_verbose from scanometrics.core import ScanOMetrics_project import numpy as np from multiprocessing import Process, Pool import os import pickle from itertools import repeat from time import sleep # Define mapping of categorical covariates to numerical values cov2float = {'sex': {'F': 1, 'M': 0}, 'diagnosis_subtype': {'CDR-0': 0, 'CDR-0.5': 1, 'CDR-1': 2, 'CDR-2': 3, 'CDR-3': 4}, 'scanner_type': {'TrioTim': 0, 'Avanto': 1, 'Sonata': 2, 'Biograph_mMR': 3}, 'scanner': {'OASIS-21926': 0, 'OASIS-26321': 1, 'OASIS-35177': 2, 'OASIS-35248': 3, 'OASIS-51010': 4, 'OASIS-NA': 5}, 'sequence': {'': 0, 'MPRAGE_GRAPPA2': 1, 'Sagittal_3D_Accelerated_T1': 2, 't1_mpr_1mm_p2': 3, 't1_mpr_1mm_p2_pos50': 4, 't1_mpr_ns_sag': 5, 't1_mprage_sag_isoWU': 6} } # Load subjects and extract list of healthy subject IDs bids_directory = '..' SOM = ScanOMetrics_project(bids_directory, proc_pipeline='dldirect', cov2float=cov2float.copy(), n_threads=1, atlas='DesikanKilliany') SOM.load_subjects() HC_subj_ids = [] for subj_id in SOM.subject.keys(): is_HC = True for ses_id in SOM.subject[subj_id].keys(): for acq_id in SOM.subject[subj_id][ses_id].keys(): if SOM.subject[subj_id][ses_id][acq_id]['diagnosis_subtype'] != 0: is_HC = False if is_HC: HC_subj_ids.append(subj_id) Then the following code defines the LOOCV function: :: # Define LOOCV analysis for a single subject def run_loocv(subject_to_exclude, HC_list, bids_folder): SOM_in = ScanOMetrics_project(shared_bids_folder, 'dldirect', acq_pattern='*T1w', dataset_id='OASIS3_dldirect_LOOCV-exclude-%s' % subject_to_exclude, cov2float=cov2float.copy(), n_threads=1, atlas='DesikanKilliany') LOO_HC_list = HC_list.copy() LOO_HC_list.remove(subject_to_exclude) SOM_in.load_subjects(subjects_include=LOO_HC_list) SOM_in.load_proc_metrics() SOM_in.set_normative_model('Polynomial') SOM_in.normativeModel.flag_outliers(1.5, 'measured_metrics') SOM_in.normativeModel.uncertainty = {} for k in SOM_in.measured_metrics.keys(): SOM_in.normativeModel.uncertainty[k] = np.ones(SOM_in.measured_metrics[k].shape[1]) SOM_in.normativeModel.fit(flag_opt=1, N_cycl=100, global_deg_max=22) # Leave default deg_max, frac=20, alpha=0.01 SOM_in.normativeModel.flag_outliers(1.5, 'residuals') SOM_in.normativeModel.compute_uncertainty() SOM_in.normativeModel.fit(flag_opt=1, N_cycl=100) # deg_max is computed depending on number of samples for the fit output_folder = os.path.join(shared_bids_folder, 'derivatives', 'scanometrics', SOM_in.normativeModel.model_dataset_id) if not os.path.exists(output_folder): os.makedirs(output_folder) SOM_in.save_normative_model(output_filename=os.path.join(output_folder, SOM_in.normativeModel.model_dataset_id+'.pkl')) SOM_out = ScanOMetrics_project(shared_bids_folder, 'dldirect', dataset_id='OASIS3_dldirect_LOOCV-exclude-%s' % subject_to_exclude, acq_pattern='*T1w', cov2float=cov2float.copy(), n_threads=1) SOM_out.load_subjects(subjects_include=[subject_to_exclude]) SOM_out.normativeModel = SOM_in.normativeModel SOM_out.load_proc_metrics(ref_metric_values=SOM_out.normativeModel.measured_metrics['orig'].copy(), ref_metric_names=SOM_out.normativeModel.metric_names.copy(), ref_covariate_values=SOM_out.normativeModel.covariate_values.copy(), ref_covariate_names=SOM_out.normativeModel.covariate_names.copy()) evaluation_results = SOM_out.evaluate_singleSubject_allSes(subject_to_exclude, ['sex', 'sequence', 'scanner'], create_html_report=False) output_folder = os.path.join(shared_bids_folder, 'derivatives', 'scanometrics', SOM_out.normativeModel.model_dataset_id, subject_to_exclude) if not os.path.exists(output_folder): os.makedirs(output_folder) with open(os.path.join(output_folder, 'loocv_evaluation_results.pkl'), 'wb') as fout: pickle.dump(evaluation_results, fout, pickle.HIGHEST_PROTOCOL) # Run LOOCV over all healthy subjects, in parallel using n_threads n_threads = 20 with Pool(n_threads) as pool: pool.starmap(run_loocv, zip(HC_subj_ids, repeat(HC_subj_ids, len(HC_subj_ids))), bids_directory) Full script *********** :: from scanometrics.utils.logging import set_verbose from scanometrics.core import ScanOMetrics_project import numpy as np from multiprocessing import Process, Pool import os import pickle from itertools import repeat from time import sleep # Define LOOCV analysis for a single subject def run_loocv(subject_to_exclude, HC_list, bids_folder): SOM_in = ScanOMetrics_project(shared_bids_folder, 'dldirect', acq_pattern='*T1w', dataset_id='OASIS3_dldirect_LOOCV-exclude-%s' % subject_to_exclude, cov2float=cov2float.copy(), n_threads=1, atlas='DesikanKilliany') LOO_HC_list = HC_list.copy() LOO_HC_list.remove(subject_to_exclude) SOM_in.load_subjects(subjects_include=LOO_HC_list) SOM_in.load_proc_metrics() SOM_in.set_normative_model('Polynomial') SOM_in.normativeModel.flag_outliers(1.5, 'measured_metrics') SOM_in.normativeModel.uncertainty = {} for k in SOM_in.measured_metrics.keys(): SOM_in.normativeModel.uncertainty[k] = np.ones(SOM_in.measured_metrics[k].shape[1]) SOM_in.normativeModel.fit(flag_opt=1, N_cycl=100, global_deg_max=22) # Leave default deg_max, frac=20, alpha=0.01 SOM_in.normativeModel.flag_outliers(1.5, 'residuals') SOM_in.normativeModel.compute_uncertainty() SOM_in.normativeModel.fit(flag_opt=1, N_cycl=100) # deg_max is computed depending on number of samples for the fit output_folder = os.path.join(shared_bids_folder, 'derivatives', 'scanometrics', SOM_in.normativeModel.model_dataset_id) if not os.path.exists(output_folder): os.makedirs(output_folder) SOM_in.save_normative_model(output_filename=os.path.join(output_folder, SOM_in.normativeModel.model_dataset_id+'.pkl')) SOM_out = ScanOMetrics_project(shared_bids_folder, 'dldirect', dataset_id='OASIS3_dldirect_LOOCV-exclude-%s' % subject_to_exclude, acq_pattern='*T1w', cov2float=cov2float.copy(), n_threads=1) SOM_out.load_subjects(subjects_include=[subject_to_exclude]) SOM_out.normativeModel = SOM_in.normativeModel SOM_out.load_proc_metrics(ref_metric_values=SOM_out.normativeModel.measured_metrics['orig'].copy(), ref_metric_names=SOM_out.normativeModel.metric_names.copy(), ref_covariate_values=SOM_out.normativeModel.covariate_values.copy(), ref_covariate_names=SOM_out.normativeModel.covariate_names.copy()) evaluation_results = SOM_out.evaluate_singleSubject_allSes(subject_to_exclude, ['sex', 'sequence', 'scanner'], create_html_report=False) output_folder = os.path.join(shared_bids_folder, 'derivatives', 'scanometrics', SOM_out.normativeModel.model_dataset_id, subject_to_exclude) if not os.path.exists(output_folder): os.makedirs(output_folder) with open(os.path.join(output_folder, 'loocv_evaluation_results.pkl'), 'wb') as fout: pickle.dump(evaluation_results, fout, pickle.HIGHEST_PROTOCOL) # Define mapping of categorical covariates to numerical values cov2float = {'sex': {'F': 1, 'M': 0}, 'diagnosis_subtype': {'CDR-0': 0, 'CDR-0.5': 1, 'CDR-1': 2, 'CDR-2': 3, 'CDR-3': 4}, 'scanner_type': {'TrioTim': 0, 'Avanto': 1, 'Sonata': 2, 'Biograph_mMR': 3}, 'scanner': {'OASIS-21926': 0, 'OASIS-26321': 1, 'OASIS-35177': 2, 'OASIS-35248': 3, 'OASIS-51010': 4, 'OASIS-NA': 5}, 'sequence': {'': 0, 'MPRAGE_GRAPPA2': 1, 'Sagittal_3D_Accelerated_T1': 2, 't1_mpr_1mm_p2': 3, 't1_mpr_1mm_p2_pos50': 4, 't1_mpr_ns_sag': 5, 't1_mprage_sag_isoWU': 6} } # Load subjects and extract list of healthy subject IDs bids_directory = '..' SOM = ScanOMetrics_project(bids_directory, proc_pipeline='dldirect', cov2float=cov2float.copy(), n_threads=1, atlas='DesikanKilliany') SOM.load_subjects() HC_subj_ids = [] for subj_id in SOM.subject.keys(): is_HC = True for ses_id in SOM.subject[subj_id].keys(): for acq_id in SOM.subject[subj_id][ses_id].keys(): if SOM.subject[subj_id][ses_id][acq_id]['diagnosis_subtype'] != 0: is_HC = False if is_HC: HC_subj_ids.append(subj_id) # Run LOOCV over all healthy subjects, in parallel using n_threads n_threads = 20 with Pool(n_threads) as pool: pool.starmap(run_loocv, zip(HC_subj_ids, repeat(HC_subj_ids, len(HC_subj_ids))), bids_directory)