%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% team_training_code
%%% Train ECG leads and obtain classifier models
%%% for the 12-, 6-, 4-, 3- and 2-lead ECG sets
%%%
%%% Inputs:
%%%  input_directory
%%%  output_directory
%%%
%%% Outputs:
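%%%  model - networks trained for the last lead set processed (2-lead);
%%%          the models of every lead set are saved to output_directory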
%%%
%%% Author:  Santiago Jiménez-Serrano [sanjiser@upv.es]
%%% Version: 1.0
%%% Date:    2020-03-26
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


function  model = team_training_code(input_directory, output_directory)
% TEAM_TRAINING_CODE  Extract features from every recording, preprocess them
% and train/save one set of per-class networks for each lead configuration.
%
% Inputs:
%   input_directory  - folder scanned for the training recordings
%   output_directory - folder where '<N>_lead_ecg_model.mat' files are written
%
% Output:
%   model - cell array of networks trained for the LAST lead set processed
%           (the 2-lead set). The networks of every lead set are saved to
%           output_directory by save_ECGleads_model.
%
% Raises:
%   team_training_code:noInputFiles  when no recording is found.

    % Lead sets (12, 6, 4, 3 and 2 leads). Only the SIZE of each set is used
    % below: get_leads() is called with the number of leads.
    twelve_leads = {'I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6'};
    six_leads    = {'I', 'II', 'III', 'aVR', 'aVL', 'aVF'};
    four_leads   = {'I', 'II', 'III', 'V2'};
    three_leads  = {'I', 'II', 'V2'};
    two_leads    = {'I', 'II'};
    lead_sets    = {twelve_leads, six_leads, four_leads, three_leads, two_leads};

    % Get the file names
    [input_files, num_files] = getInputFiles(input_directory);
    fprintf('Number of training files: %d\n', num_files);

    % Fail early with a clear message instead of an index error further down
    if num_files == 0
        error('team_training_code:noInputFiles', ...
            'No .mat recordings found in "%s".', input_directory);
    end

    % Extract classes from dataset
    [classes, num_classes] = get_classes(input_directory, input_files);

    % Progress is reported every debug_step files
    debug_step   = int32(100);

    % Feature layout: 2 global features followed by one fixed-size group of
    % features per lead, for 12 leads (see get_features_idx)
    nfeatures_x_lead = 92+17;
    num_features     = 2 + (nfeatures_x_lead*12);

    labels   = zeros(num_files, num_classes);
    features = zeros(num_files, num_features);

    fprintf('\nReading data and Getting Features...\n')

    %% Load data recordings and header files
    % Each iteration fills one row of features/labels (sliced parfor outputs)
    parfor i = 1 : num_files

        % Debug
        if mod(i, debug_step) == 0
            fprintf('Loading & Featuring    %5d/%5d ...\n', i, num_files);
        end

        % Load data & Extract features
        [features(i,:), labels(i, :)] = read_features(input_directory, input_files{i}, classes);

    end

    %% Outliers filtering
    % NOTE: only the filtered features are used below; the returned
    % statistics are not stored with the model.
    [features, muOld, sgOld, muNew, sgNew, totFiltered] = FiltradoOutliers(features); %#ok<ASGLU>

    %% NaNs filtering (column medians are saved with the model)
    [features, medianas] = FiltradoNaN(features);

    %% Apply z-Score (mu/sigma are saved with the model)
    [features, mu, sigma] = ApplyZScore(features);

    %% Train the models

    % Read the first header to get the lead indexes
    [~, header_data] = read_sample(input_directory, input_files{1});

    % Silence warnings during training and restore the caller's previous
    % warning state on ANY exit path (normal return or error).
    previous_warnings = warning('off', 'all');
    restore_warnings  = onCleanup(@() warning(previous_warnings)); %#ok<NASGU>

    for i = 1:length(lead_sets)

        % Train ECG model
        num_leads = length(lead_sets{i});
        disp(['Training ',num2str(num_leads),'-lead ECG model...'])

        [leads, leads_idx] = get_leads(header_data, num_leads); %#ok<ASGLU>
        disp(leads_idx);

        % Columns of the feature matrix that belong to the selected leads
        Features_leads_idx = get_features_idx(leads_idx, nfeatures_x_lead);
        Features_leads     = features(:, Features_leads_idx);

        % Train the model and save it with the matching preprocessing stats
        nets = train_whole_nn(Features_leads, labels, classes);
        save_ECGleads_model(num_leads, nets, mu(Features_leads_idx), sigma(Features_leads_idx), medianas(Features_leads_idx), output_directory, classes);
    end

    % Return the networks of the last lead set trained
    model = nets;

end

%% 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Get Input files
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [input_files, nfiles] = getInputFiles(input_directory)
% GETINPUTFILES  List the recording files (*.mat) found in a directory.
%
% Input:
%   input_directory - folder to scan (not recursive)
%
% Outputs:
%   input_files - N-by-1 cell array with the file names (no folder part)
%   nfiles      - number of files found (N)
%
% Hidden entries (names starting with '.') and folders are skipped. Only
% names with a real '.mat' extension are accepted, so short names no longer
% raise an index error and names that merely end in "mat" are not matched.

    fprintf('\n Getting training file names ...\n');

    input_files = cell(0, 1);
    nfiles      = 0;

    listing = dir(input_directory);
    if isempty(listing)
        return;
    end

    % Keep plain files only
    names = {listing(~[listing.isdir]).name};
    if isempty(names)
        return;
    end

    % Single pass: not hidden AND has the .mat extension (case-sensitive)
    is_recording = ~startsWith(names, '.') & endsWith(names, '.mat');

    input_files = names(is_recording).';
    nfiles      = numel(input_files);

end



%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Feature Extraction from one given sample
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [features, labels] = read_features(input_directory, input_file, classes)
% READ_FEATURES  Load one recording and return its feature vector and its
% multi-label class vector.
%
% Inputs:
%   input_directory - folder that holds the recording and its header
%   input_file      - recording file name
%   classes         - cell array with the known class identifiers
%
% Outputs:
%   features - value returned by get_features for this recording
%   labels   - 1-by-length(classes) vector, 1 where the recording's '#Dx'
%              header line lists that class, 0 elsewhere

    % Load data
    [data, header_data] = read_sample(input_directory, input_file);

    % The number of leads is the second space-separated token of the first
    % header line. str2double is used instead of str2num because str2num
    % evaluates its input as code.
    first_line_tokens = strsplit(header_data{1}, ' ');
    num_leads         = str2double(first_line_tokens{2});
    [~, leads_idx]    = get_leads(header_data, num_leads);

    % Extract features
    features = get_features(data, header_data, leads_idx);

    % Vector that will contain a 1 if the sample belongs to such class
    labels = zeros(1, length(classes));

    %% Extract labels (only the first '#Dx' line is used)
    for j = 1 : length(header_data)
        if startsWith(header_data{j}, '#Dx')
            tmp = strsplit(header_data{j}, ': ');
            % A recording may carry several comma-separated labels
            Dx_classes = strsplit(tmp{2}, ',');
            labels(ismember(classes, Dx_classes)) = 1;
            break
        end
    end

end


function [data, header_data] = read_sample(input_directory, input_file)
% READ_SAMPLE  Load the signal and header lines of one recording.
%
% Inputs:
%   input_directory - folder that holds the recording
%   input_file      - recording file name, with or without extension
%
% Outputs:
%   data        - first output of load_challenge_data
%   header_data - second output of load_challenge_data (header lines)

    % Strip only the extension. Splitting on every '.' would truncate record
    % names that contain dots (e.g. 'A.0001.mat' -> 'A').
    [sub_folder, record_name] = fileparts(input_file);
    record_path = fullfile(input_directory, sub_folder, record_name);

    [data, header_data] = load_challenge_data(record_path);
end


%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Features Index - Utils
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function features_idx = get_features_idx(leads_idx, num_features_x_lead)
% GET_FEATURES_IDX  Column indexes of the features that belong to a set of
% leads.
%
% Inputs:
%   leads_idx           - lead numbers (1-based); row or column vector
%   num_features_x_lead - number of feature columns stored per lead
%
% Output:
%   features_idx - ascending row vector with columns 1 and 2 plus, for each
%                  lead k, columns 3+(k-1)*n ... 2+k*n (n = features per lead)

    % Full layout: 2 leading columns + 12 lead groups
    num_features = 2 + (num_features_x_lead*12);
    selected     = false(1, num_features);

    % The 2 first features are always kept (age and sex per the original note)
    selected(1:2) = true;

    % Force a row vector: a 'for' over a column would run once with the whole
    % column as loop value and silently select only the first lead.
    for lead = reshape(leads_idx, 1, [])
        first_col = 3 + ((lead-1)*num_features_x_lead);
        last_col  = first_col + num_features_x_lead - 1;
        selected(first_col:last_col) = true;
    end

    % Mask -> index array
    features_idx = find(selected);
end


%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Model Saving
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function save_ECGleads_model(num_leads, model, mu, sigma, medianas, output_directory, classes) %save_ECG_model
% SAVE_ECGLEADS_MODEL  Write a trained model and its preprocessing statistics
% to '<num_leads>_lead_ecg_model.mat' inside output_directory.
%
% The MAT-file (v7.3) stores the variables: model, mu, sigma, medianas and
% classes, exactly as received.

    filename = fullfile(output_directory, [num2str(num_leads),'_lead_ecg_model.mat']);

    % Names of the input arguments to persist
    stored_vars = {'model', 'mu', 'sigma', 'medianas', 'classes'};
    save(filename, stored_vars{:}, '-v7.3');

    disp(['Save Model ',  filename,' -> Done']);
end


%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Dataset/Features Saving
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function save_ECGleads_features(features, output_directory)
% SAVE_ECGLEADS_FEATURES  Write the feature matrix to 'features.mat' inside
% output_directory (variable name in the file: features).

    target_file = fullfile(output_directory, 'features.mat');
    save(target_file, 'features');

    disp('Save Features -> DONE')
end


%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% File Utilities
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [classes, num_classes] = get_classes(input_directory, files)
% GET_CLASSES  Collect the unique class identifiers found in the '#Dx' line
% of every header file.
%
% Inputs:
%   input_directory - folder that holds the header files
%   files           - cell array of recording names; '.mat' is replaced by
%                     '.hea' to locate each header
%
% Outputs:
%   classes     - sorted row cell array of unique class identifiers
%   num_classes - number of unique classes
%
% Raises:
%   get_classes:headerOpenFailed  when a header file cannot be opened.
%
% Headers without a '#Dx' line contribute no classes (previously the EOF
% marker returned by fgetl was passed to startsWith, which raised an error).

    fprintf('\n Getting unique class identifiers ...\n');

    collected = {};

    % For each file...
    for i = 1:length(files)
        header_name = strrep(files{i}, '.mat', '.hea');
        header_path = fullfile(input_directory, header_name);
        collected   = [collected, get_classes_read_dx(header_path)]; %#ok<AGROW>
    end

    if isempty(collected)
        classes = {};
    else
        % unique() removes duplicates and sorts; force a row like before
        classes = reshape(unique(collected), 1, []);
    end
    num_classes = length(classes);

end

function dx_codes = get_classes_read_dx(header_path)
% Return the comma-separated codes of the first '#Dx' line of one header
% file as a row cell array (empty when the file has no such line).

    dx_codes = {};

    fid = fopen(header_path);
    if fid < 0
        error('get_classes:headerOpenFailed', ...
            'Cannot open header file "%s".', header_path);
    end
    close_file = onCleanup(@() fclose(fid)); %#ok<NASGU>

    tline = fgetl(fid);
    while ischar(tline)      % fgetl returns -1 (numeric) at end of file
        if startsWith(tline, '#Dx')
            parts = strsplit(tline, ': ');
            if numel(parts) >= 2
                dx_codes = reshape(strsplit(parts{2}, ','), 1, []);
            end
            break
        end
        tline = fgetl(fid);
    end

end

function [data, tlines] = load_challenge_data(filename)
% LOAD_CHALLENGE_DATA  Read one recording: header lines from
% '<filename>.hea' and the signal from the variable 'val' stored in
% '<filename>.mat'.
%
% Input:
%   filename - path of the recording WITHOUT extension
%
% Outputs:
%   data   - contents of the variable 'val' of the MAT-file
%   tlines - N-by-1 cell array with the header lines, in file order
%
% Raises:
%   load_challenge_data:headerOpenFailed  when the header cannot be opened
%   load_challenge_data:missingVal        when the MAT-file has no 'val'

    % Opening header file
    header_path = [filename '.hea'];
    fid = fopen(header_path);

    % fopen returns -1 on failure; stop here instead of letting fgetl fail
    % later with an unrelated "invalid file identifier" error
    if fid < 0
        error('load_challenge_data:headerOpenFailed', ...
            'Cannot open header file "%s".', header_path);
    end
    close_file = onCleanup(@() fclose(fid));

    tlines = cell(0,1);
    tline  = fgetl(fid);
    while ischar(tline)      % fgetl returns -1 (numeric) at end of file
        tlines{end+1,1} = tline; %#ok<AGROW>
        tline = fgetl(fid);
    end
    clear close_file;        % closes the header file now

    f = load([filename '.mat']);

    if ~isfield(f, 'val')
        error('load_challenge_data:missingVal', ...
            'File "%s.mat" does not contain the variable ''val''.', filename);
    end
    data = f.val;

end


%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Dataset preprocessing/filtering
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


function [dataset, muOld, sgOld, muNew, sgNew, totFiltered] = ...
    FiltradoOutliers(dataset)
% FILTRADOOUTLIERS  Clamp every column of a dataset to mean +/- 3 standard
% deviations (NaN values are ignored by the statistics and left untouched).
%
% Input:
%   dataset - samples-by-features numeric matrix
%
% Outputs:
%   dataset     - matrix with the outliers clamped
%   muOld/sgOld - per-column mean / standard deviation BEFORE clamping
%   muNew/sgNew - per-column mean / standard deviation AFTER clamping
%   totFiltered - number of entries at or beyond the limits
%
% Fix: the previous implementation stored the clamping direction in a
% logical array; assigning -1 into a logical array stores 'true', so low
% outliers were moved to the UPPER limit. Each side is now clamped to its
% own limit.

    nsg = 3;

    % Explicit dimension so a single-row dataset is still handled per column
    muOld = nanmean(dataset, 1);
    sgOld = nanstd(dataset, 0, 1);

    lower_limit = muOld - nsg .* sgOld;
    upper_limit = muOld + nsg .* sgOld;

    totFiltered = 0;
    [~, nf] = size(dataset);

    for i = 1 : nf

        % Data vector of this feature
        v = dataset(:, i);

        % Entries to clamp (comparisons with NaN are false, so NaNs stay)
        too_low  = v <= lower_limit(i);
        too_high = v >= upper_limit(i);

        % Set them to the corresponding minimum or maximum
        v(too_low)  = lower_limit(i);
        v(too_high) = upper_limit(i);

        % Put the data back in its column
        dataset(:, i) = v;

        % Increase the number of filtered samples
        totFiltered = totFiltered + nnz(too_low | too_high);
    end

    muNew = nanmean(dataset, 1);
    sgNew = nanstd(dataset, 0, 1);

end

function [dataset, medianas] = FiltradoNaN(dataset)
% FILTRADONAN  Replace the NaN entries of every column by the column median
% (computed ignoring NaNs) and report each replacement on the console.
%
% Input:
%   dataset - samples-by-features numeric matrix
%
% Outputs:
%   dataset  - matrix with the NaNs replaced
%   medianas - per-column medians used for the replacement

    medianas = nanmedian(dataset);
    [total_rows, total_cols] = size(dataset);

    for col = 1 : total_cols

        % Rows of this column that hold a NaN
        missing   = isnan(dataset(:, col));
        n_missing = sum(missing);

        % Nothing to replace in this column
        if n_missing <= 0
            continue;
        end

        % Debug
        fprintf('Replacing NaNs in col [%d] => %d NaNs in %d rows (replaced by median: %f) \n' , ...
            col, n_missing, total_rows, medianas(col));

        % Set the median value in the NaN positions
        dataset(missing, col) = medianas(col);
    end

end


%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Z-Scoring
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [dataset, mu, sigma] = ApplyZScore(dataset)
% APPLYZSCORE  Standardize a dataset with zscore.
%
% Input:
%   dataset - numeric matrix
%
% Outputs:
%   dataset - standardized data returned by zscore
%   mu      - second output of zscore (means used for centering)
%   sigma   - third output of zscore (standard deviations used for scaling)

    [standardized, mu, sigma] = zscore(dataset);
    dataset = standardized;

end



%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Neural Networks training
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [r] = getBestThresholds(net, features, y, class_name)
% GETBESTTHRESHOLDS  Evaluate a network on every row of a feature matrix and
% pick the decision threshold with getBestTh.
%
% Inputs:
%   net        - network object, called as net(column_vector, 'useGPU', 'no')
%   features   - samples-by-features matrix (one sample per row)
%   y          - expected responses, passed unchanged to getBestTh
%   class_name - class identifier, used only in the console messages
%
% Output:
%   r - [threshold, g] as returned by getBestTh

    [n_rows, n_cols] = size(features);

    % One network response per sample
    scores = zeros(1, n_rows);

    fprintf('Classifiyng for class %s ... #samples = %d, #features = %d ', class_name, n_rows, n_cols);
    parfor row = 1 : n_rows
        scores(row) = net(features(row, :)', 'useGPU', 'no');
    end

    % Threshold search over the collected responses
    [threshold, g_score] = getBestTh(y, scores);
    fprintf(' best [th, g] = [%.4f, %.4f] \n', threshold, g_score);

    r = [threshold, g_score];

end


function [net] = setBestThresholds(net, features, y, class_name)
% SETBESTTHRESHOLDS  Store in net.userdata.ths the [threshold, g] pair
% returned by getBestThresholds for the given samples.

    best_pair = getBestThresholds(net, features, y, class_name);
    net.userdata.ths = best_pair;

end


function [net] = train_nn_step01(features, y, class_name, layer_size)
% TRAIN_NN_STEP01  Train one feed-forward network for a single class and
% attach its decision threshold.
%
% Inputs:
%   features   - samples-by-features matrix (one sample per row)
%   y          - response vector with 1 for positives and 0 for negatives
%                (assumes a column vector: the index lists are stacked
%                vertically below - TODO confirm against callers)
%   class_name - class identifier, forwarded for console messages
%   layer_size - passed to feedforwardnet as the hidden layer size(s)
%
% Output:
%   net - trained network; net.userdata.ths is set by setBestThresholds
%         using the held-out rows.


    % Row indexes of the negative and positive samples
    ynegidx = find(y==0);
    yposidx = find(y==1);
    
    % 75% of each class goes to training (int32 rounds to the nearest integer)
    cutoff_neg = int32(length(ynegidx)*0.75);
    cutoff_pos = int32(length(yposidx)*0.75);
    
    % The split is taken in order of appearance: the first 75% of each class
    % trains the net and the remaining rows are held out for thresholding
    tridx = [ynegidx(1:cutoff_neg);     yposidx(1:cutoff_pos)];
    teidx = [ynegidx(cutoff_neg+1:end); yposidx(cutoff_pos+1:end)];
    
    % Random permutation (shuffles the order inside each subset only)
    tridx = tridx(randperm(length(tridx)));
    teidx = teidx(randperm(length(teidx)));
    

    net = feedforwardnet(layer_size, 'trainscg'); % Trainscg is the trainFcn
    
    %net = feedforwardnet(256); % Trainscg is the trainFcn
    % Should the z-score function be applied here????
    %net.trainFcn = 'trainscg';
    
    %processFcns = cell(1, 1);
    %processFcns{1} = 'processpca';
    %processFcns{1} = 'mapstd';
    % See
    % https://es.mathworks.com/help/deeplearning/ug/choose-neural-network-input-output-processing-functions.html
    %net.inputs{1}.processFcns = processFcns;
    
    % Configure the [min, max | avg, std] arrays based on the input data
    % NOTE(review): at this point y still holds 0/1 targets; they are
    % relabelled to -1/1 a few lines below, after configure - confirm that
    % this ordering is intended.
    net = configure(net, features(tridx, :)', y(tridx)');
    %net = configure(net, features', y');
    
    %for i=1:net.numLayers
    %    if strcmp(net.layers{i}.transferFcn, 'tansig')
    %        net.layers{i}.transferFcn = 'elliotsig';
    %        %net.layers{i}.transferFcn = 'logsig';
    %    end
    %end
    
    % Do not open the training GUI window
    net.trainParam.showWindow = 0;
    
    % Negative targets must be -1
    y(y==0) = -1;
    
    % Train the net (samples are passed as columns; GPU use is requested)
    net = train(net, features(tridx, :)', y(tridx)', ...
        'useGPU', 'yes', ...
        'showResources', 'no');
    
    % Get the best threshold on the held-out rows; y holds -1/1 here
    net = setBestThresholds(net, features(teidx, :), y(teidx), class_name);
    
end



function [new_features, features_indexes] = train_feature_selection(features, y, class_name)
% TRAIN_FEATURE_SELECTION  Two-stage feature selection for one class.
%
% Stage 1: from column 3 on, keep a column only when ttest2 between the
%          rows with y==1 and the rows with y==0 returns h == 1.
% Stage 2: walking the kept columns in order (columns 1 and 2 excluded),
%          drop every later column whose correlation coefficient with the
%          current one is >= 0.9.
%
% Inputs:
%   features   - samples-by-features matrix
%   y          - response vector (1 = positive, 0 = negative)
%   class_name - class identifier, used only in the console message
%
% Outputs:
%   new_features     - features(:, features_indexes)
%   features_indexes - selected column indexes; columns 1 and 2 always kept

    [~, n_cols] = size(features);

    rows_pos = (y == 1);
    rows_neg = (y == 0);

    % ---- Stage 1: two-sample t-test per candidate column ----------------
    candidates     = 3 : n_cols;
    is_significant = false(size(candidates));

    for k = 1 : numel(candidates)
        col = candidates(k);
        h = ttest2(features(rows_pos, col), features(rows_neg, col));
        is_significant(k) = (h == 1);
    end

    % First two columns always set
    features_indexes = [1, 2, candidates(is_significant)];
    nf1 = sum(~is_significant);   % #features filtered at stage 1

    % ---- Stage 2: greedy removal of highly correlated columns -----------
    nf2   = 0;                    % #features filtered at stage 2
    pivot = 3;

    while pivot < numel(features_indexes)

        later     = pivot + 1 : numel(features_indexes);
        redundant = false(size(later));

        for k = 1 : numel(later)
            c = corrcoef([features(:, features_indexes(pivot)), features(:, features_indexes(later(k)))]);
            redundant(k) = (c(1, 2) >= 0.9);
        end

        nf2 = nf2 + sum(redundant);
        features_indexes(later(redundant)) = [];

        pivot = pivot + 1;
    end

    fprintf('FS [%s]: #features=%d #nf1=%d #nf2=%d #nf=%d\n', class_name, length(features_indexes), nf1, nf2, nf1+nf2);

    new_features = features(:, features_indexes);

end



function [net] = train_nn(features, y, class_name)
% TRAIN_NN  Select features for one class, train one network per candidate
% hidden-layer size and return the one with the highest g score.
%
% Inputs:
%   features   - samples-by-features matrix
%   y          - response vector for this class
%   class_name - class identifier, forwarded for console messages
%
% Output:
%   net - chosen network; net.userdata.features_idx holds the selected
%         column indexes and net.userdata.ths the [threshold, g] pair.
%         On ties (or non-comparable scores) the earlier candidate is kept.

    % 0 - Perform the feature selection
    [features, features_indexes] = train_feature_selection(features, y, class_name);

    % Candidate hidden-layer sizes, tried in this order
    layer_sizes = [18, 32];

    has_best = false;

    for k = 1 : length(layer_sizes)

        fprintf('Layers: %d: \n', layer_sizes(k));
        candidate = train_nn_step01(features, y, class_name, layer_sizes(k));
        candidate.userdata.features_idx = features_indexes;

        % The first candidate is always taken; later ones must improve g
        if ~has_best
            net      = candidate;
            has_best = true;
        elseif candidate.userdata.ths(2) > net.userdata.ths(2)
            net = candidate;
        end

    end

end




function [nets] = train_whole_nn(features, labels, classes)
% TRAIN_WHOLE_NN  Train one binary network per class.
%
% Inputs:
%   features - samples-by-features matrix shared by all classes
%   labels   - samples-by-classes matrix; column k is the response of class k
%   classes  - cell array with the class identifiers
%
% Output:
%   nets - 1-by-nclasses cell array; nets{k} is the result of train_nn for
%          class k

    % One column of labels per class
    [~, n_models] = size(labels);

    nets = cell(1, n_models);

    for c = 1 : n_models
        current_class = classes{c};
        fprintf('Training net %2d/%2d for class: %20s \n', c, n_models, current_class);

        % Response vector of this class
        response = labels(:, c);

        nets{c} = train_nn(features, response, current_class);
    end
end



%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Threshold selection
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [best_th, best_g] = getBestTh(y, y_hat)
% GETBESTTH  Scan thresholds from min(y_hat) to max(y_hat) in steps of 0.005
% and return the one with the highest g = sqrt(sensitivity * specificity).
%
% Inputs:
%   y     - expected responses; entries equal to 1 are positives, any other
%           value counts as negative
%   y_hat - model responses; the first length(y) entries are used
%
% Outputs:
%   best_th - first threshold reaching the highest g (-1 when no threshold
%             yields g > 0)
%   best_g  - highest g found (0 when none is greater than 0)
%
% A sample is predicted positive when y_hat >= threshold.

    best_th = -1.0;
    best_g  =  0.0;

    step       = 0.005;
    thresholds = min(y_hat) : step : max(y_hat);

    % Nothing to scan
    if isempty(thresholds)
        return;
    end

    % Work on column vectors so row/column inputs can be mixed
    n      = length(y);
    truth  = y(1:n);
    truth  = truth(:);
    scores = y_hat(1:n);
    scores = scores(:);

    is_positive = (truth == 1);
    is_negative = ~is_positive;

    for th = thresholds

        at_or_above = (scores >= th);
        below       = (scores <  th);

        % Confusion counts for this threshold
        vp = nnz(is_positive &  at_or_above);   % true positives
        fn = nnz(is_positive & ~at_or_above);   % false negatives
        vn = nnz(is_negative &  below);         % true negatives
        fp = nnz(is_negative & ~below);         % false positives

        % Sensitivity and specificity
        sen = vp / (vp+fn);
        esp = vn / (vn+fp);

        % Geometric mean of both rates
        g = sqrt(sen * esp);

        % Keep the first threshold with a strictly better g
        if g > best_g
            best_g  =  g;
            best_th = th;
        end
    end

end

