% Start from an empty workspace.
% NOTE(review): because of this, the `~exist(..., 'var')` guards further down
% (featureset/categoricals and classifier) are always true when the script is
% run from the top; they only have an effect when sections are run one by one.
clearvars

%% Load global parameters
% load classification parameters
% NOTE(review): presumably this script defines `leads`, the `features_*`
% lists and `dte_parameters`, which are used below but never assigned in
% this script -- confirm.
parameterCLS
dx_codes_scores = readmatrix('dx_mapping_scored.csv', 'OutputType', 'string'); % scored diagnoses, read in as strings
dx_codes_weights = readmatrix('weights.csv', 'OutputType', 'string'); % weights for the scored diagnoses (appears unused in this script)

% The data consists of many small separate files, so unzip them into a local
% directory to speed up reading them into memory.
source_dir = fullfile('D:', 'tmp', 'preprocessed');
output_dir = fullfile('D:', 'tmp', 'outputs');
labels_dir = fullfile('D:', 'tmp', 'labels');

%% build featureset
% Build the feature table only when it is not already in the workspace, so the
% build can be skipped when this section is re-run on its own.
% Both outputs are needed further down, so rebuild when EITHER one is missing
% (with `&&` the build was skipped as soon as one of the two existed, leaving
% the other one undefined).
if ~exist('featureset', 'var') || ~exist('categoricals', 'var')
    [featureset, categoricals] = build_featureset(source_dir, ...
                                                  leads, ...
                                                  features_numerical, ...
                                                  features_numerical_global, ...
                                                  features_categorical, ...
                                                  features_categorical_global);
end

%% transform outputs into matrix
% The last table column holds, per record, a cell with a comma-separated
% list of diagnosis codes; split it into the individual codes.
n_records = size(featureset, 1);
y_labels = cell(n_records, 1);
for i = 1:n_records
    y_labels{i} = split(featureset{i, end}{1}, ',');
end

% Scored diagnosis codes (column 2 of the mapping), sorted ascending by
% number of occurrences (second-to-last column of the mapping).
[B, I] = sort(double(dx_codes_scores(:, end-1)), 1);
scored_labels = dx_codes_scores(I, 2);

% Binary indicator matrix: Y(j, i) is 1 when record j carries diagnosis i.
n_dx = length(scored_labels);
Y = zeros(n_records, n_dx);
for j = 1:n_records % iterate over records
    for i = 1:n_dx % iterate over diagnoses
        Y(j, i) = any(y_labels{j} == scored_labels(i), 'all');
    end
end

%% prepare input data for training
% Every column except the last one (the label column) is a predictor.
predictor_cols = 1:(size(featureset, 2) - 1);
var_names = featureset.Properties.VariableNames(predictor_cols);
X = table2array(featureset(:, predictor_cols));
% fill missing entries using the "median" strategy, then standardise
[X, rpl] = impute_missing_values(X, "median");
[X, means, stds] = center_and_scale(X);

%% dimensionality reduction using PCA
if dte_parameters.perform_pca
    % Columns fed into the PCA: within each of 12 consecutive groups of 58
    % columns the first two are skipped and the remaining 56 are used
    % (3:58, 61:116, ..., 641:696); column 697 is appended at the end.
    within_group = (3:58).';
    group_offset = (0:11) * 58;
    pca_cols = [reshape(within_group + group_offset, 1, []), 697];
    [coeff, score, latent] = pca(X(:, pca_cols));

    % choose number of principal components: the smallest k whose cumulative
    % share of the variance reaches the requested percentage
    perc_var_explained = 80; % percentage of explained variance by choosing k principal components
    explained_share = cumsum(latent) / sum(latent);
    k = find(explained_share >= perc_var_explained/100);
    k = k(1);

    % Replace X by the first k component scores plus four table columns.
    % NOTE(review): these four columns are taken from the raw table, not from
    % the imputed/scaled X, and columns 374 and 375 are also part of
    % pca_cols -- confirm this is intended.
    kept_cols = [1, 2, 374, 375];
    X = [score(:, 1:k), table2array(featureset(:, kept_cols(1:2))), table2array(featureset(:, kept_cols(3:4)))];

    % column names matching the new X: PC_1 .. PC_k, then the kept columns
    pc_names = arrayfun(@(p) ['PC_' num2str(p)], (1:k).', 'UniformOutput', false);
    var_names = [pc_names; featureset.Properties.VariableNames(kept_cols).'];
end

%% train classifier (with kfold-crossvalidation)
% Re-run the parameter script and then override the number of stratified folds.
% NOTE(review): presumably the re-run resets dte_parameters to its defaults -- confirm.
parameterCLS
dte_parameters.stratified_kfolds = 3;
rng(1992); % control random number generator for cvp generation
% Fit the first-stage classifier unless one is already in the workspace.
% NOTE(review): when the PCA section has replaced X, verify that `categoricals`
% still refers to the right columns of the reduced matrix.
if ~exist('classifier', 'var')
    classifier = fit_boosted_decision_tree_ensemble(X, Y, ...
                                                    scored_labels, ...
                                                    true, ...
                                                    categoricals, ...
                                                    var_names, ...
                                                    dte_parameters);
end

%% evaluate classifier
% Predict on the same X that is passed to the fit above.
% X = featureset(:, 1:end-1);
[predictions, scores] = predict_boosted_decision_tree_ensemble(classifier, X, scored_labels, var_names);

%% train classifier second stage (with kfold-crossvalidation)
% Second stage: a logistic regression is fitted on the first slice along the
% third dimension of the first-stage scores, against the same targets Y.
cls_second_parameters.stratified_kfolds = 3;
rng(2018); % control random number generator for cvp generation
classifier_second = fit_logistic_regression(scores(:, :, 1), Y, ...
                                            scored_labels, ...
                                            true, ...
                                            cls_second_parameters);
% alternative second-stage classifier (disabled):
% classifier_second = fit_classifier_second_stage(scores, Y, ...
%                                                 scored_labels, ...
%                                                 cls_second_parameters);
%%
% Second-stage predictions on the same first-stage score slice used for fitting.
[predictions_2, scores_2] = predict_logistic_regression(classifier_second, scores(:, :, 1), scored_labels);                     

%% threshold tuning


%% save predictions to file and evaluate with official scoring function
% The second-stage results are the ones written to disk.
preds = predictions_2;
scs = scores_2;

% Write one prediction file and one label file per record while updating a
% single-line percentage indicator in the command window.
n_preds = size(preds, 1);
record_names = featureset.Properties.RowNames;
msg = '';
for i = 1:size(X, 1)
    % Erase the previously printed percentage. msg contains an escaped "%%"
    % that prints as a single character, hence one backspace fewer.
    erase_seq = repmat('\b', 1, numel(msg)-1);
    fprintf(erase_seq);
    msg = [num2str(i/n_preds*100) ' %%'];
    fprintf(msg);

    record_name = record_names{i};
    save_challenge_predictions(output_dir, ...
                               record_name, ...
                               scs(i, :, 2), ...
                               preds(i, :), ...
                               scored_labels)
    save_challenge_labels(labels_dir, ...
                          record_name, ...
                          featureset{i,end}{1})
end

% Run the official scoring script on the label and prediction folders.
% make sure python and numpy package are installed
%
% The folders are taken from labels_dir / output_dir so the evaluator always
% reads what was written above, instead of repeating the paths as literals.
eval_dir = fullfile('eval', 'evaluation-2020');
cmd = sprintf('python evaluate_12ECG_score.py "%s" "%s"', labels_dir, output_dir);

% The evaluator is started from its own folder. cd returns the folder we came
% from, which is restored afterwards (also when the call errors) rather than
% relying on a relative '..\..'.
previous_dir = cd(eval_dir);
try
    status = system(cmd);
catch err
    cd(previous_dir);
    rethrow(err);
end
cd(previous_dir);

% surface a failed evaluation instead of silently continuing
if status ~= 0
    warning('evaluation:failed', ...
            'Scoring script exited with status %d.', status);
end

%% plots
% confusion matrices: one 2x2 chart per diagnosis, true labels from Y against
% the first-stage predictions.
% NOTE(review): this plots `predictions` (first stage), whereas the files
% written for scoring use `predictions_2` -- confirm which one is intended.
figure();
n_dx = size(predictions, 2);
% Keep the 5x6 layout while it fits; add rows when there are more than 30
% diagnoses so subplot does not error on an out-of-range index.
grid_cols = 6;
grid_rows = max(5, ceil(n_dx / grid_cols));
cm = struct();
for i=1:n_dx
    % Fix the class order to [0 1] so the matrix is always 2x2, even when a
    % diagnosis never occurs (or always occurs) in both Y and predictions;
    % otherwise confusionmat returns a smaller matrix that does not match
    % the two class labels passed to confusionchart.
    cm(i).C = confusionmat(Y(:,i), predictions(:,i), 'Order', [0 1]);
    cm(i).dx = scored_labels{i};
    subplot(grid_rows, grid_cols, i);
    chart = confusionchart(cm(i).C, {'0', '1'});
    chart.RowSummary = 'row-normalized';
    chart.ColumnSummary = 'column-normalized';
    title(cm(i).dx);
end
