#!/usr/bin/env python
from collections import Counter

import numpy as np, os, sys, joblib
from scipy.io import loadmat
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from imblearn.over_sampling import SMOTE
from sklearn.multioutput import MultiOutputClassifier
import constants
import get_12ECG_features
from sklearn import metrics
import fnmatch
from scipy.io import loadmat
import numpy as np
import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
from imblearn.ensemble import BalancedBaggingClassifier,RUSBoostClassifier

# Load challenge data.
def load_challenge_data(header_file):
    """Read a header file and the recording stored next to it.

    Args:
        header_file: Path to a header file (normally ``*.hea``). The recording
            is read from the file with the same path but a ``.mat`` extension.

    Returns:
        A ``(recording, header)`` tuple. ``recording`` is the ``'val'``
        variable of the ``.mat`` file converted to a float64 array and
        ``header`` is the list of lines of the header file (line endings
        included).
    """
    with open(header_file, 'r') as f:
        header = f.readlines()
    # Swap only the extension. str.replace('.hea', '.mat') would also rewrite
    # any '.hea' that occurs earlier in the path (e.g. in a directory name).
    mat_file = os.path.splitext(header_file)[0] + '.mat'
    x = loadmat(mat_file)
    recording = np.asarray(x['val'], dtype=np.float64)
    return recording, header

# Find unique classes.
def get_classes(input_directory, filenames):
    """Collect the sorted set of diagnosis codes listed in header files.

    Args:
        input_directory: Unused; kept so existing callers keep working.
        filenames: Iterable of header-file paths to scan.

    Returns:
        Sorted list of the unique, whitespace-stripped codes found on the
        lines that start with ``#Dx``.
    """
    classes = set()
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                if not line.startswith('#Dx'):
                    continue
                # Split on the first ':' only, so '#Dx:a,b' (no space after
                # the colon) is accepted instead of raising IndexError.
                _, _, codes = line.partition(':')
                for code in codes.split(','):
                    code = code.strip()
                    # An empty '#Dx:' entry must not register '' as a class.
                    if code:
                        classes.add(code)
    return sorted(classes)

# Exploratory script section: list the header files in a hard-coded directory,
# load every usable recording and build engineered features from them.
# NOTE(review): this is module-level code, so it also runs when the module is
# imported only to call train_12ECG_classifier; os.listdir below requires the
# hard-coded directory to exist.
input_directory = 'G:\\CincCompetition\\Data\\RawData\\Combine\\'
output_directory = 'G:\\CincCompetition\\Logs\\'
#input_directory = '/media/ajzheng/New Volume1/CincCompetition/Data/RawData/Combine1/'
#input_directory = 'G:\\CincCompetition\\Data\\RawData\\PhysioNetChallenge2020_Training_2\Training_2\\'
# This empty list is overwritten two lines below.
header_files = []
# Bare file names (no directory part) of every '*.hea' entry in the directory.
InformationFileName = fnmatch.filter(os.listdir(input_directory),'*.hea')
header_files = InformationFileName
num_files = len(header_files)

######Get labels for level 1
recordingsLevel1 = list()
headersLevel1 = list()
# NOTE(review): nothing is appended to level1_labels; the only append (inside
# the loop below) is commented out.
level1_labels = list()

for i in range(num_files):
    # header_files holds bare names, so the directory (which ends with a
    # path separator) is prepended by plain concatenation.
    recording, header = load_challenge_data(input_directory+header_files[i])
    num_leads, ptID, gender, age, sample_Fs, lead_info, classes = get_12ECG_features.parse_hea_file(header)
    # Skip (and print the ID of) recordings whose target-class vector sums to
    # zero — presumably records with no diagnosis of interest; confirm in
    # get_12ECG_features.get_target_classes.
    if(np.sum(get_12ECG_features.get_target_classes(classes)) == 0):
        print(ptID)
        continue
    # Skip recordings whose header does not describe exactly 12 leads.
    if(len(lead_info) != 12):
        continue
    #level1_labels.append(get_12ECG_features.get_target_classes_NA(classes))
    #recordingsLevel1.append(recording[[1,6,7,8,9,10,11],:])
    recordingsLevel1.append(recording)
    headersLevel1.append(header)

# NOTE(review): the meaning of the positional arguments (2, 1, 20) is not
# visible in this file, and the other GenerateEngFeature calls here pass a
# different number of arguments — TODO confirm against its signature.
features, labels = get_12ECG_features.GenerateEngFeature(recordingsLevel1, headersLevel1, 2, 1, 20)

# Encode every label row as a single number: the dot product of the row with
# descending powers of two, so the first column carries the largest weight.
IntLabels = np.array([], dtype=int)
for label_row in labels:
    bit_weights = 2 ** np.arange(label_row.size)[::-1]
    IntLabels = np.append(IntLabels, label_row.dot(bit_weights))



#for i in range(2962,len(recordingsLevel1)):
#    print(i)
#    RpeaksInfo = biosppy.signals.ecg.ecg(recordingsLevel1[i][0, :], 500, show=False)

# Level-1 label exploration: drop the label columns that are zero for every
# recording, then count how often each distinct label combination occurs.
# NOTE(review): level1_labels is never appended to (the only append, in the
# loading loop above, is commented out), so level1_labelsarray is empty here
# and the axis=1 delete below is expected to fail — confirm whether this
# section is still needed.
level1_labelsarray = np.asarray(level1_labels)
# Indices of the columns whose entries are all zero.
idx = np.argwhere(np.all(level1_labelsarray[..., :] == 0, axis=0))
level1_labelsarray = np.delete(level1_labelsarray, idx, axis=1)

#unique_rows = np.unique(level1_labelsarray, axis=0)
level1LabelDF = pd.DataFrame(level1_labelsarray)
# The value of this expression is discarded when the file runs as a script;
# it is only visible in an interactive session.
level1LabelDF.sum()
# Group on all columns at once to get the size of each distinct label row.
gDF = level1LabelDF.groupby(level1LabelDF.columns.tolist(),as_index=False).size()
# Discarded as well: first 40 entries after a descending sort.
gDF.sort_values(ascending=False)[0:40]


# Recompute features and labels (overwriting the values produced earlier in
# the script), drop unused label columns and persist both arrays.
# NOTE(review): this call passes (3, 20) while the earlier call passes
# (2, 1, 20); the meaning of these arguments is not visible here — TODO
# confirm against GenerateEngFeature's signature.
features, labels = get_12ECG_features.GenerateEngFeature(recordingsLevel1, headersLevel1, 3, 20)
# Indices of the label columns that are zero for every row.
idx = np.argwhere(np.all(labels[..., :] == 0, axis=0))
labels = np.delete(labels, idx, axis=1)

# np.save appends '.npy' when the name does not already end with it, so the
# files on disk are '*.pkl.npy' (NumPy format, not pickle), matching the
# commented-out np.load calls below.
np.save('G:\\CincCompetition\\Data\\RawData\\featuresLevelNA.pkl', features)
np.save('G:\\CincCompetition\\Data\\RawData\\labelsLevelNA.pkl', labels)

#features =np.load('G:\\CincCompetition\\Data\\RawData\\featuresLevel1.pkl.npy')
#labels =np.load('G:\\CincCompetition\\Data\\RawData\\labelsLevel1.pkl.npy')

# Encode every label row as a single number (dot product with descending
# powers of two, first column weighted highest) ...
IntLabels = np.array([], dtype=int)
for label_row in labels:
    bit_weights = 2 ** np.arange(label_row.size)[::-1]
    IntLabels = np.append(IntLabels, label_row.dot(bit_weights))

# ... then collapse the codes to a flag: 1 where the code is positive, else 0.
IntLabels[IntLabels > 0] = 1

# CounterArray = pd.Series(Counter(IntLabels))
# LabelsOver100 = CounterArray.loc[CounterArray >=100].index.to_list()
# idxOver100 = np.argwhere(np.isin(IntLabels, LabelsOver100))
#
# Over100Labels = IntLabels[idxOver100][:,0]
# Over100Features = features[idxOver100[:,0],:]


# Single stratified shuffle split: oversample the training part with SMOTE,
# fit ExtraTrees and XGBoost on it and print a classification report for each
# on the untouched test part.
classifierET = ExtraTreesClassifier(n_estimators=500, criterion='entropy', bootstrap=False, max_features='sqrt', n_jobs=20)

classiferXgb = xgb.XGBClassifier(max_depth=100, objective='binary:logistic', tree_method='approx', scale_pos_weight=1,
                              grow_policy='depthwise', learning_rate=0.01, n_estimators=500, n_jobs=20)

classifierRU = RUSBoostClassifier(base_estimator=classifierET)
sm = SMOTE(random_state=10, sampling_strategy='auto', n_jobs=-1)
skf = StratifiedShuffleSplit(n_splits=1)
# NOTE(review): the split and the models use `labels` (the label matrix), not
# the binarized IntLabels computed just above, which is unused in the visible
# code — confirm which target was intended here.
for train_index, test_index in skf.split(features, labels):
    print('Training Size and Testingt Size:', train_index.shape, test_index.shape)
    x_train, x_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    #RUfit = classifierRU.fit(x_train, y_train)
    # Only the training part is resampled; the test part keeps its original
    # class distribution.
    x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)  # sm.fit_resample(x_train, y_train)
    #fits.append(classifer.fit(x_train_sm, y_train_sm))
    # for idx, cl in enumerate(y_test_tranpose):
    #x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)  # sm.fit_resample(x_train, y_train)
    ETfit = classifierET.fit(x_train_sm, y_train_sm)
    print(metrics.classification_report(y_test, ETfit.predict(x_test)))
    XgbFit = classiferXgb.fit(x_train_sm, y_train_sm)
    print(metrics.classification_report(y_test, XgbFit.predict(x_test)))



    # NOTE(review): classifierRU is never fitted (its fit call above is
    # commented out), so this predict call is expected to raise — confirm
    # whether the fit should be restored or this line removed.
    print(metrics.classification_report(y_test, classifierRU.predict(x_test)))


# Per-label evaluation: for every label column, make one stratified shuffle
# split, rebalance the training part with SMOTEENN, fit a binary classifier
# and print its test accuracy and classification report.
skf = StratifiedShuffleSplit(n_splits=1)
from imblearn.combine import SMOTEENN
from sklearn.base import clone
sm = SMOTE(random_state=10, sampling_strategy='auto', n_jobs=-1)
smnn = SMOTEENN(random_state=44, n_jobs=20)
recordings = None
headers = None
fits = []
for cl in labels.transpose():
    for train_index, test_index in skf.split(features, cl):
        print('Training Size and Testingt Size:', train_index.shape, test_index.shape)
        x_train, x_test = features[train_index], features[test_index]
        y_train, y_test = cl[train_index], cl[test_index]

        # Only the training part is resampled; the test part keeps its
        # original class distribution.
        x_train_sm, y_train_sm = smnn.fit_resample(x_train, y_train)

        # The name `classifer` used here before is not defined at module
        # level (NameError). Use an unfitted copy of the module-level XGBoost
        # configuration instead; a fresh copy per label is required because
        # fit() returns the estimator itself, so refitting one shared instance
        # would fill `fits` with references to the same, last-trained model.
        model = clone(classiferXgb)
        fits.append(model.fit(x_train_sm, y_train_sm))

        # Report the score instead of computing it and discarding the result.
        print('Test accuracy:', model.score(x_test, y_test))
        print(metrics.classification_report(y_test, model.predict(x_test)))



def train_12ECG_classifier(input_directory, output_directory):
    """Train one binary XGBoost classifier per label column and save them.

    Every header file in ``input_directory`` is loaded together with its
    recording. Recordings whose target-class vector sums to zero, or whose
    header does not describe exactly 12 leads, are skipped. Features and
    labels come from ``get_12ECG_features.GenerateEngFeature``; for each label
    column the data is oversampled with SMOTE and a separate classifier is
    fitted.

    Args:
        input_directory: Directory containing the header files and recordings.
        output_directory: Directory in which ``xgb_classifier.sav`` is
            written. The file holds a dict whose ``'xgb_binary_classifier'``
            entry is the list of fitted classifiers, one per label column.
    """
    # Load data.
    print('Loading data...')

    header_files = []
    for f in os.listdir(input_directory):
        g = os.path.join(input_directory, f)
        if not f.lower().startswith('.') and f.lower().endswith('hea') and os.path.isfile(g):
            header_files.append(g)

    recordings = []
    headers = []
    for header_file in header_files:
        recording, header = load_challenge_data(header_file)
        parsed = get_12ECG_features.parse_hea_file(header)
        # Only the lead description and the class list are needed here.
        lead_info, classes = parsed[5], parsed[6]
        if np.sum(get_12ECG_features.get_target_classes(classes)) == 0:
            continue
        if len(lead_info) != 12:
            continue
        recordings.append(recording)
        headers.append(header)

    features, labels = get_12ECG_features.GenerateEngFeature(recordings, headers, constants.NUM_CORES)

    fits = []
    sm = SMOTE(random_state=10)
    for cl in labels.transpose():
        # Build a fresh estimator for every label. fit() returns the estimator
        # itself, so refitting one shared instance would leave `fits` holding
        # N references to a single model trained on the last label only.
        classifer = xgb.XGBClassifier(max_depth=200, objective='binary:logistic', tree_method='approx',
                                      scale_pos_weight=1, grow_policy='depthwise', learning_rate=0.01,
                                      n_estimators=500, n_jobs=-1)
        Train_X, Train_Y = sm.fit_resample(features, cl)
        fits.append(classifer.fit(Train_X, Train_Y))

    filename = os.path.join(output_directory, 'xgb_classifier.sav')
    joblib.dump({'xgb_binary_classifier': fits}, filename, protocol=0)

    # for index, ff in enumerate(fits):
    #     ff.save_model('xgb_classifier_' + index + '.sav')

    # for trainIndex, testIndex in skf.split(FeatureArrayList[ifeatureset], IntLabelListForAllPVCs[ilabelist]):
    #     print('Training Size and Testingt Size:', trainIndex.shape, testIndex.shape, num_validation)
    #     for ismote in range(2):
    #         if ismote == 0:
    #             Train_x, Train_y = FeatureArrayList[ifeatureset][trainIndex], IntLabelListForAllPVCs[ilabelist][
    #                 trainIndex]
    #         else:
    #             Train_x, Train_y = smote.fit_resample(FeatureArrayList[ifeatureset][trainIndex],
    #                                                   IntLabelListForAllPVCs[ilabelist][trainIndex])
    #         # ModelFitList =[] Parallel(n_jobs=5)(delayed(FitModel)(clf, Train_x, Train_y) for clf in Classifiers)
    #         Test_x, Test_y = FeatureArrayList[ifeatureset][testIndex], IntLabelListForAllPVCs[ilabelist][testIndex]
    #         for imodel in range(len(Classifiers)):
    #             clfFit = Classifiers[imodel].fit(Train_x, Train_y)
    # #Train_X is feature, this np.array dim=2 row is observation, col is variable (126006)
    # #Train_y is label, [1,2,3,4,5] #
    # skf = StratifiedShuffleSplit(n_splits=10, test_size=0.1)
    # num_validation = 0
    # # TotalF1SMOTE =0
    # # TotalF1NonSMOTE=0
    # # TotalAccuraySMOT =0
    # # TotalAccurayNonSMOT=0

    # #One model for level 1 labels
    # #The other model for level 2 labels
    #
    # print(metrics.classification_report(Test_y, clfFit.predict(Test_x)))
    # print(metrics.confusion_matrix(Test_y, clfFit.predict(Test_x)))
    # print(metrics.f1_score(Test_y, clfFit.predict(Test_x), average='weighted'))
    # print(metrics.accuracy_score(Test_y, clfFit.predict(Test_x)))
    #
    #
    # dtrain = xgb.DMatrix(features, label=labels)
    # bst = xgb.train(param, dtrain, 5, evallist)
    #
    #
    # # Train model.
    # print('Training model...')
    #
    #
    # features = list()
    # labels = list()
    #
    # for i in range(num_files):
    #     recording = recordings[i]
    #     header = headers[i]
    #
    #
    #     tmp = get_12ECG_features(recording, header)
    #     features.append(tmp)
    #
    #     for l in header:
    #         if l.startswith('#Dx:'):
    #             labels_act = np.zeros(num_classes)
    #             arrs = l.strip().split(' ')
    #             for arr in arrs[1].split(','):
    #                 class_index = classes.index(arr.rstrip()) # Only use first positive index
    #                 labels_act[class_index] = 1
    #     labels.append(labels_act)
    #
    # features = np.array(features)
    # labels = np.array(labels)
    #
    # # Replace NaN values with mean values
    # imputer=SimpleImputer().fit(features)
    # features=imputer.transform(features)
    #
    # # Train the classifier
    # model = RandomForestClassifier().fit(features,labels)
    #
    # # Save model.
    # print('Saving model...')
    #
    # final_model={'model':model, 'imputer':imputer,'classes':classes}
    #
    # filename = os.path.join(output_directory, 'finalized_model.sav')
    # joblib.dump(final_model, filename, protocol=0)

