#!/usr/bin/env python

# Edit this script to add your team's code. Some functions are *required*, but you can edit most parts of the required functions,
# change or remove non-required functions, and add your own functions.

################################################################################
#
# Import libraries and functions. You can change or remove them.
#
################################################################################

from helper_code import *
import numpy as np, scipy as sp, scipy.stats, os, sys, joblib
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import librosa
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgbm
from imblearn.over_sampling import SVMSMOTE
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, confusion_matrix, roc_auc_score,accuracy_score
import seaborn as sns 
import matplotlib.pyplot as plt
import params_config
import pickle
from tqdm import tqdm
from scipy import signal
sns.set()

################################################################################
#
# Required functions. Edit these functions to add your code, but do not change the arguments.
#
################################################################################

# Train your model.
def train_challenge_model(data_folder, model_folder, verbose):
    """Train the three LightGBM classifiers and save them to ``model_folder``.

    One feature vector is extracted per recording (not per patient); each vector is
    labelled with its patient's murmur and outcome class. The features are mean-imputed,
    written to CSV, min-max scaled, and then used to train:

    * classifier 1: murmur Unknown vs. known (Present/Absent),
    * classifier 2: murmur Present vs. Absent,
    * classifier 3: outcome Abnormal vs. Normal.

    Args:
        data_folder: Folder containing the Challenge patient files and recordings.
        model_folder: Folder that receives the final ``model.sav`` bundle.
        verbose: Verbosity level; ``>= 1`` prints progress, ``>= 2`` prints
            per-patient progress.

    Raises:
        Exception: If no patient files are found in ``data_folder``.
    """
    # Find data files.
    if verbose >= 1:
        print('Finding data files...')

    # Find the patient data files.
    patient_files = find_patient_files(data_folder)
    num_patient_files = len(patient_files)

    if num_patient_files == 0:
        raise Exception('No data was provided.')

    # Create a folder for the model if it does not already exist.
    os.makedirs(model_folder, exist_ok=True)
    # The intermediate CSV and scaler files written below (and read back by the
    # normalization / data-preparation helpers) use the hard-coded relative folder
    # "model/", which is not necessarily `model_folder`. Make sure it exists so the
    # writes do not fail when a different model folder is requested.
    os.makedirs('model', exist_ok=True)

    # Extract the features and labels.
    if verbose >= 1:
        print('Extracting features and labels from the Challenge data...')

    murmur_classes = ['Present', 'Unknown', 'Absent']
    murmur_dict = {'Present': 0, 'Unknown': 1, 'Absent': 2}
    outcome_classes = ['Abnormal', 'Normal']
    outcome_dict = {'Abnormal': 0, 'Normal': 1}

    features = {
        'features': []
    }
    murmurs = list()
    outcomes = list()

    for i in tqdm(range(num_patient_files), total=num_patient_files):
        if verbose >= 2:
            print('    {}/{}...'.format(i+1, num_patient_files))

        # Load the current patient data and recordings.
        current_patient_data = load_patient_data(patient_files[i])
        current_recordings = load_recordings(data_folder, current_patient_data)

        # Encode the patient's murmur and outcome labels as integers.
        current_murmur = murmur_dict[get_murmur(current_patient_data)]
        current_outcome = outcome_dict[get_outcome(current_patient_data)]

        # Extract features: one entry per recording. Every recording inherits the
        # labels of its patient, so the labels are appended once per recording.
        current_features = get_features(current_patient_data, current_recordings)
        for feas in current_features:
            for key in feas:
                if key != 'loc':
                    features[key].append(feas[key])
            murmurs.append(current_murmur)
            outcomes.append(current_outcome)

    # Column vectors, one row per recording.
    murmurs = np.vstack(murmurs)
    outcomes = np.vstack(outcomes)

    # Preview the sizes of features, murmurs, and outcomes.
    print(f'murmur:{murmurs.shape}m outcome:{outcomes.shape}')
    for key in features:
        print(f'{key}:{len(features[key])}')

    # Fill in missing values with the per-column mean.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(features['features'])
    features['features'] = imputer.transform(features['features'])

    # Save the raw feature table, then normalize it.
    for key in features:
        # Append the murmur and outcome labels as the last two columns.
        features[key] = np.hstack((features[key], murmurs, outcomes))
        print(f'{key} : {features[key].shape}')
        df = pd.DataFrame(features[key])
        filename = "model/"+str(key)+".csv"
        df.to_csv(filename, index=False)
        # Min-max normalization (also persists the fitted scaler and the scaled CSV).
        print("Start normalize .....")
        features[key] = normalize(key, features[key])

    # Train the models.
    if verbose >= 1:
        print('Training Murmur model...')
    # First classifier: murmur Unknown vs. known.
    model1 = train_model1(features, model_folder, murmur_classes, imputer)
    # Second classifier: murmur Absent vs. Present.
    model2 = train_model2(features, model_folder, murmur_classes, imputer)
    # Third classifier: outcome Abnormal vs. Normal.
    model3 = train_model3(features, model_folder, outcome_classes, imputer)

    # Save everything needed at inference time.
    save_challenge_model(
        model_folder=model_folder,
        imputer=imputer,
        murmur_classes=murmur_classes,
        outcome_classes=outcome_classes,
        classifier1=model1,
        classifier2=model2,
        classifier3=model3,
    )

    if verbose >= 1:
        print('Done.')

# Load your trained model. This function is *required*. You should edit this function to add your code, but do *not* change the
# arguments of this function.
def load_challenge_model(model_folder, verbose):
    """Load the saved model bundle (``model.sav``) from ``model_folder``.

    Args:
        model_folder: Folder that contains ``model.sav``.
        verbose: Verbosity level (not used here).

    Returns:
        Whatever object was stored in ``model.sav`` via joblib.
    """
    return joblib.load(os.path.join(model_folder, 'model.sav'))

# Run your trained model. This function is *required*. You should edit this function to add your code, but do *not* change the
# arguments of this function.
def run_challenge_model(model, data, recordings, verbose):
    """Predict murmur and outcome classes for one patient.

    Args:
        model: Dict with the keys 'imputer', 'murmur_classes', 'classifier1',
            'classifier2', 'outcome_classes' and 'classifier3'.
        data: Patient data passed through to ``get_features``.
        recordings: The patient's recordings passed through to ``get_features``.
        verbose: Verbosity level (not used here).

    Returns:
        A tuple ``(classes, labels, probabilities)``: the murmur classes followed by
        the outcome classes, a 0/1 label array with exactly one murmur and one
        outcome class set, and the matching probabilities.
    """
    imputer = model['imputer']
    murmur_classes = model['murmur_classes']
    murmur_classifier1 = model['classifier1']
    murmur_classifier2 = model['classifier2']
    outcome_classes = model['outcome_classes']
    outcome_classifier = model['classifier3']

    # Build one normalized feature vector per recording.
    DATA = {
        'features': []
    }
    for fea in get_features(data, recordings):
        # Training imputes missing values *before* min-max scaling, so apply the same
        # fitted imputer here before normalizing; otherwise NaNs would reach the
        # scaler and classifiers un-imputed.
        raw = np.asarray(fea['features'], dtype=float).reshape(1, -1)
        fea['features'] = imputer.transform(raw).reshape(-1,)
        # Min-max normalization with the scaler fitted during training.
        fea = test_normalize(fea)
        DATA['features'].append(fea['features'])

    # Get classifier probabilities.
    murmur_probabilities = predict_murmur(DATA, imputer, murmur_classifier1, murmur_classifier2)
    outcome_probabilities = predict_outcome(DATA, imputer, outcome_classifier)

    # Choose label with highest probability.
    murmur_labels = np.zeros(len(murmur_classes), dtype=np.int_)
    murmur_labels[np.argmax(murmur_probabilities)] = 1
    outcome_labels = np.zeros(len(outcome_classes), dtype=np.int_)
    outcome_labels[np.argmax(outcome_probabilities)] = 1

    # Concatenate classes, labels, and probabilities.
    classes = murmur_classes + outcome_classes
    labels = np.concatenate((murmur_labels, outcome_labels))
    probabilities = np.concatenate((murmur_probabilities, outcome_probabilities))

    return classes, labels, probabilities

################################################################################
#
# Optional functions. You can change or remove these functions and/or add new functions.
#
################################################################################

# Save your trained model.
def save_challenge_model(model_folder, imputer, murmur_classes, outcome_classes, classifier1,classifier2,classifier3):
    """Bundle the imputer, class lists and three classifiers and dump them to ``model.sav``.

    The bundle is a plain dict written with joblib (pickle protocol 0) into
    ``model_folder``.
    """
    bundle = dict(
        imputer=imputer,
        murmur_classes=murmur_classes,
        outcome_classes=outcome_classes,
        classifier1=classifier1,
        classifier2=classifier2,
        classifier3=classifier3,
    )
    joblib.dump(bundle, os.path.join(model_folder, 'model.sav'), protocol=0)

# Extract features from the data.
def get_features(data, recordings):
    """Extract one audio feature vector per recording location.

    Args:
        data: Patient data from which the recording locations are read.
        recordings: Recordings indexed in the same order as the locations.

    Returns:
        A list of dicts, one per location, each with the keys 'features' (the
        vector returned by ``extract_from_audio``) and 'loc' (the location).
    """
    return [
        {'features': extract_from_audio(recordings[idx]), 'loc': location}
        for idx, location in enumerate(get_locations(data))
    ]

def audio_INT_to_FLOAT(audio):
    """Convert audio samples read from a WAV file from integer to floating point.

    The conversion is delegated to ``librosa.util.buf_to_float``, called with the
    input array's own dtype as the ``dtype`` argument.
    """
    # NOTE(review): librosa documents `dtype` as the target *output* type and uses
    # n_bytes=2 by default, so this presumably expects int16 input — confirm.
    return librosa.util.buf_to_float(audio, dtype=audio.dtype)

def band_pass_filter(original_signal, order, fc1,fc2, fs):
    """Apply a Butterworth band-pass filter to a signal.

    :param original_signal: audio samples
    :param order: filter order
    :param fc1: lower cut-off frequency
    :param fc2: upper cut-off frequency
    :param fs: sampling rate of the audio
    :return: the filtered audio samples
    """
    # Express the cut-off frequencies as fractions of the Nyquist frequency (fs / 2).
    low = 2*fc1/fs
    high = 2*fc2/fs
    numerator, denominator = signal.butter(N=order, Wn=[low, high], btype='bandpass')
    # Single forward pass of the filter.
    return signal.lfilter(numerator, denominator, original_signal)

def extract_from_audio(audio):
    """Extract a fixed-length feature vector from one recording.

    Pre-processing: convert to float, trim leading/trailing silence, band-pass
    filter (order 2, 1-400 Hz), pad or cut to 25 seconds, then resample from
    4000 Hz to 2000 Hz.

    The returned vector is the concatenation of:

    * the per-coefficient mean of 15 MFCCs,
    * the mean zero-crossing rate,
    * the mean spectral centroid,
    * the mean RMS energy,
    * the LPC coefficients of an order-10 model.

    Args:
        audio: Samples of one recording as loaded from the WAV file.

    Returns:
        A 1-D numpy array of features.
    """
    # NOTE(review): the input sampling rate is hard-coded rather than taken from the
    # recording; this assumes every recording is sampled at 4000 Hz — confirm.
    fs = 4000
    target_sr = 2000

    audio = audio_INT_to_FLOAT(audio)
    source, _ = librosa.effects.trim(audio)  # remove silence
    source = band_pass_filter(source, 2, 1, 400, fs)  # band-pass filter
    source = librosa.util.fix_length(data=source, size=fs*25)  # keep exactly 25 s
    source = librosa.resample(source.astype(np.float32), orig_sr=fs, target_sr=target_sr, fix=True, scale=False)  # resample
    fs = target_sr

    # Only the features that end up in the returned vector are computed. The STFT,
    # chroma, mel-spectrogram, spectral-flatness and onset-strength features used to
    # be computed here as well but were never part of the result.
    mfcc = np.mean(librosa.feature.mfcc(y=source, sr=fs, n_mfcc=15, fmax=500), axis=1)
    zcr = np.mean(librosa.feature.zero_crossing_rate(source, frame_length=512, hop_length=256))
    spec_cent = np.mean(librosa.feature.spectral_centroid(y=source, sr=fs, n_fft=512))  # spectral centroid
    rmse = np.mean(librosa.feature.rms(y=source, frame_length=512, hop_length=256))  # root-mean-square energy
    lpc = librosa.lpc(source, order=10)

    return np.hstack((mfcc, zcr, spec_cent, rmse, lpc))

# Normalization
def normalize(key, data):
    """Min-max scale the feature columns of ``data`` and persist the results.

    The last two columns of ``data`` are the murmur and outcome labels; every
    other column is treated as a feature. The fitted scaler is saved to
    ``model/scaler_<key>.save`` and the scaled table (features plus the two
    label columns) to ``model/minmax_<key>.csv``.

    Args:
        key: Name used in the two output file names.
        data: 2-D array whose last two columns are the murmur and outcome labels.

    Returns:
        A DataFrame with the scaled features followed by the 'murmur' and
        'outcome' columns.
    """
    feature_part = data[:, :-2]

    # Fit on the training features and keep the scaler for inference time.
    scaler = MinMaxScaler().fit(feature_part)
    joblib.dump(scaler, f"model/scaler_{key}.save")

    frame = pd.DataFrame(scaler.transform(feature_part))
    frame['murmur'] = data[:, -2]
    frame['outcome'] = data[:, -1]
    frame.to_csv('model/minmax_'+key+".csv", index=False)
    return frame

def test_normalize(features):
    '''
    测试集数据正则化
    '''
    data = dict()
    for key in features:
        if(key!='loc'):
            if type(features[key]) is not np.ndarray:
                features[key] = np.array(features[key])
            if(key=='extract'):
                scaler = joblib.load(f"model/scaler_{key}.save")
                x = features[key].reshape(1,-1)
                x1 = x[:,:3]
                x2 = x[:,3:]
                nor_x1 = scaler.transform(x1)
                data[key] = np.hstack((nor_x1,x2)).reshape(-1,)
            else:
                scaler = joblib.load(f"model/scaler_{key}.save")
                data[key] = scaler.transform(features[key].reshape(1, -1)).reshape(-1,)
    return data

def save_feature_importance(model,file):
    """Write the model's split-count feature importances to ``cm/<file>.csv``.

    Rows are sorted by importance, highest first.

    Args:
        model: Object exposing ``feature_importance(importance_type=...)`` and
            ``feature_name()``.
        file: Base name (without extension) of the CSV written into ``cm/``.
    """
    split_counts = model.feature_importance(importance_type='split')
    names = model.feature_name()
    ranked = pd.DataFrame(
        {'feature_name': names, 'importance': split_counts}
    ).sort_values(by='importance', ascending=False)
    ranked.to_csv(f'cm/{file}.csv', index=False)

def prepare_model1(features_dict):
    """Prepare the training data for classifier 1 (murmur Unknown vs. known).

    The data is read back from ``model/minmax_features.csv``; ``features_dict``
    is not used. The last column (outcome) is dropped and the murmur column,
    originally encoded as {'Present': 0, 'Unknown': 1, 'Absent': 2}, is mapped to
    0 for Unknown and 1 for everything else.

    Returns:
        ``(X, y, smote)``: the feature matrix, the binary labels, and an
        ``SVMSMOTE`` over-sampler (sampling_strategy=1, random_state=42).
    """
    table = pd.read_csv(f"model/minmax_features.csv")

    # Drop the outcome column; the murmur column is then the last one.
    train_df = table.iloc[:, :-1]
    train_df = train_df.rename(columns={train_df.columns[-1]: "label"})
    # Unknown (1) -> 0, Present/Absent -> 1.
    train_df = train_df.assign(label=train_df['label'].apply(lambda code: int(code != 1)))
    print(train_df['label'].value_counts())

    X = train_df.iloc[:, :-1].values
    y = train_df.iloc[:, -1].values
    print(X.shape, y.shape)

    return X, y, SVMSMOTE(sampling_strategy=1, random_state=42)

def train_model1(features_dict,model_folder, classes,imputer):
    """Train classifier 1: murmur Unknown (0) vs. known (1).

    For each of 10 model seeds, a 10-fold split (always shuffled with the same
    fold seed) is run. In every fold the training part is over-sampled with
    SVMSMOTE, a LightGBM binary model is trained, and its AUC on the untouched
    validation part is computed. The single model with the highest validation
    AUC over all seeds and folds is returned.

    Args:
        features_dict: Passed through to ``prepare_model1``.
        model_folder: Not used.
        classes: Not used.
        imputer: Not used.

    Returns:
        The trained LightGBM model with the best validation AUC.
    """
    X, y, smote = prepare_model1(features_dict)

    params = {"objective": "binary",
            "boosting_type": "gbdt",
            "metric" : "auc",
            'learning_rate': 0.1,
            'n_estimators':150,
            "max_depth":5,
            "num_leaves":20,
            'max_bin': 25,
            'min_data_in_leaf': 27,
            'bagging_fraction': 0.9,
            'bagging_freq': 30,
            'feature_fraction': 0.7,
            'lambda_l1': 0.001,
            'lambda_l2': 0.001,
            'min_split_gain': 0.0,
            "verbose":-1,
            }

    # LightGBM 4.0 removed the `verbose_eval` argument of `train`; the
    # `log_evaluation` callback (available since 3.3) replaces it. Fall back to
    # `verbose_eval` on older versions that do not have the callback.
    if hasattr(lgbm, "log_evaluation"):
        log_kwargs = {"callbacks": [lgbm.log_evaluation(period=100)]}
    else:
        log_kwargs = {"verbose_eval": 100}

    seeds = range(10)
    fold_seed = 42
    n_splits = 10

    # Track the best model directly. Collecting only "improving" models in a list
    # and returning its last element fails with IndexError when the very first
    # fold already has the highest AUC.
    best_auc = None
    best_model = None
    for seed in seeds:

        print("---seed: ", seed)

        kfolds = KFold(n_splits=n_splits, random_state=fold_seed, shuffle=True)
        for trn_idx, val_idx in kfolds.split(X=X, y=y):

            # Over-sample the training part only; validate on the original data.
            kf_x_train, kf_y_train = smote.fit_resample(X[trn_idx], y[trn_idx])
            kf_x_valid, kf_y_valid = X[val_idx], y[val_idx]

            print(kf_x_train.shape, kf_y_train.shape)

            dtrain = lgbm.Dataset(kf_x_train, kf_y_train)
            dvalid = lgbm.Dataset(kf_x_valid, kf_y_valid)

            params["random_state"] = seed

            model = lgbm.train(params,
                               dtrain,
                               valid_sets=[dtrain, dvalid],
                               valid_names=["train", "valid"],
                               **log_kwargs)
            pred_valid_p = model.predict(kf_x_valid, num_iteration=model.best_iteration)

            auc = roc_auc_score(kf_y_valid, pred_valid_p)
            print(f"auc:{auc}")
            if best_model is None or auc > best_auc:
                # Confusion matrix of the new best model at a 0.5 threshold.
                pred_valid = [int(p >= 0.5) for p in pred_valid_p]
                C2 = confusion_matrix(kf_y_valid, pred_valid, labels=[0, 1])
                print('\nConfusion Matrix : \n', C2)
                best_auc = auc
                best_model = model
                print('=' * 80)
    return best_model

def prepare_model2(features_dict):
    """Prepare the training data for classifier 2 (murmur Present vs. Absent).

    The data is read back from ``model/minmax_features.csv``; ``features_dict``
    is not used. The last column (outcome) is dropped, rows whose murmur label is
    Unknown are removed, and the remaining labels, originally encoded as
    {'Present': 0, 'Unknown': 1, 'Absent': 2}, are mapped to 0 for Absent and 1
    for Present.

    Returns:
        ``(X, y, smote)``: the feature matrix, the binary labels, and an
        ``SVMSMOTE`` over-sampler (sampling_strategy=1, random_state=42).
    """
    table = pd.read_csv(f"model/minmax_features.csv")

    # Drop the outcome column; the murmur column is then the last one.
    train_df = table.iloc[:, :-1]
    train_df = train_df.rename(columns={train_df.columns[-1]: "label"})
    # Keep only Present (0) and Absent (2) rows.
    train_df = train_df[train_df['label'].isin([0,2])]
    # Absent (2) -> 0, Present (0) -> 1.
    train_df = train_df.assign(label=train_df['label'].apply(lambda code: int(code != 2)))
    print(train_df['label'].value_counts())

    X = train_df.iloc[:, :-1].values
    y = train_df.iloc[:, -1].values
    print(X.shape, y.shape)

    return X, y, SVMSMOTE(sampling_strategy=1, random_state=42)

def train_model2(features_dict,model_folder, classes,imputer):
    """Train classifier 2: murmur Absent (0) vs. Present (1).

    For each of 10 model seeds, a 10-fold split (always shuffled with the same
    fold seed) is run. In every fold the training part is over-sampled with
    SVMSMOTE, a LightGBM binary model is trained, and its AUC on the untouched
    validation part is computed. The single model with the highest validation
    AUC over all seeds and folds is returned.

    Args:
        features_dict: Passed through to ``prepare_model2``.
        model_folder: Not used.
        classes: Not used.
        imputer: Not used.

    Returns:
        The trained LightGBM model with the best validation AUC.
    """
    X, y, smote = prepare_model2(features_dict)

    params = {"objective": "binary",
            "boosting_type": "gbdt",
            "metric" : "auc",
            'learning_rate': 0.1,
            'n_estimators':189,
            "max_depth":8,
            "num_leaves":30,
            'max_bin': 35,
            'min_data_in_leaf': 1,
            'bagging_fraction': 0.8,
            'bagging_freq': 16,
            'feature_fraction': 0.6,
            'lambda_l1': 0.0,
            'lambda_l2': 0.0,
            'min_split_gain': 0.0,
            "verbose":-1,
            }

    # LightGBM 4.0 removed the `verbose_eval` argument of `train`; the
    # `log_evaluation` callback (available since 3.3) replaces it. Fall back to
    # `verbose_eval` on older versions that do not have the callback.
    if hasattr(lgbm, "log_evaluation"):
        log_kwargs = {"callbacks": [lgbm.log_evaluation(period=100)]}
    else:
        log_kwargs = {"verbose_eval": 100}

    seeds = range(10)
    fold_seed = 42
    n_splits = 10

    # Track the best model directly. Collecting only "improving" models in a list
    # and returning its last element fails with IndexError when the very first
    # fold already has the highest AUC.
    best_auc = None
    best_model = None
    for seed in seeds:

        print("---seed: ", seed)

        kfolds = KFold(n_splits=n_splits, random_state=fold_seed, shuffle=True)
        for trn_idx, val_idx in kfolds.split(X=X, y=y):

            # Over-sample the training part only; validate on the original data.
            kf_x_train, kf_y_train = smote.fit_resample(X[trn_idx], y[trn_idx])
            kf_x_valid, kf_y_valid = X[val_idx], y[val_idx]

            print(kf_x_train.shape, kf_y_train.shape)

            dtrain = lgbm.Dataset(kf_x_train, kf_y_train)
            dvalid = lgbm.Dataset(kf_x_valid, kf_y_valid)

            params["random_state"] = seed

            model = lgbm.train(params,
                               dtrain,
                               valid_sets=[dtrain, dvalid],
                               valid_names=["train", "valid"],
                               **log_kwargs)
            pred_valid_p = model.predict(kf_x_valid, num_iteration=model.best_iteration)

            auc = roc_auc_score(kf_y_valid, pred_valid_p)
            print(f"auc:{auc}")
            if best_model is None or auc > best_auc:
                # Confusion matrix of the new best model at a 0.5 threshold.
                pred_valid = [int(p >= 0.5) for p in pred_valid_p]
                C2 = confusion_matrix(kf_y_valid, pred_valid, labels=[0, 1])
                print('\nConfusion Matrix : \n', C2)
                best_auc = auc
                best_model = model
                print('=' * 80)
    return best_model

def prepare_model3(features_dict):
    """Prepare the training data for classifier 3 (outcome Abnormal vs. Normal).

    The data is read back from ``model/minmax_features.csv``; ``features_dict``
    is not used. The second-to-last column (murmur) is dropped and the last
    column, encoded as {'Abnormal': 0, 'Normal': 1}, is used as the label.

    Returns:
        ``(X, y, smote)``: the feature matrix, the binary labels, and an
        ``SVMSMOTE`` over-sampler (sampling_strategy=1, random_state=42).
    """
    table = pd.read_csv(f"model/minmax_features.csv")
    murmur_col, outcome_col = table.columns[-2], table.columns[-1]
    table.rename(columns={murmur_col: "murmur", outcome_col: "label"}, inplace=True)

    # Remove the murmur label; the outcome label stays as the last column.
    train_df = table.drop(columns=['murmur'])
    print(train_df['label'].value_counts())

    X = train_df.iloc[:, :-1].values
    y = train_df.iloc[:, -1].values
    print(X.shape, y.shape)

    return X, y, SVMSMOTE(sampling_strategy=1, random_state=42)

def train_model3(features_dict,model_folder, classes,imputer):
    """Train classifier 3: outcome Abnormal (0) vs. Normal (1).

    For each of 10 model seeds, a 10-fold split (always shuffled with the same
    fold seed) is run. In every fold the training part is over-sampled with
    SVMSMOTE, a LightGBM binary model is trained, and its AUC on the untouched
    validation part is computed. The single model with the highest validation
    AUC over all seeds and folds is returned.

    Args:
        features_dict: Passed through to ``prepare_model3``.
        model_folder: Not used.
        classes: Not used.
        imputer: Not used.

    Returns:
        The trained LightGBM model with the best validation AUC.
    """
    X, y, smote = prepare_model3(features_dict)

    params = {"objective": "binary",
            "boosting_type": "gbdt",
            "metric" : "auc",
            'learning_rate': 0.1,
            'n_estimators':47,
            "max_depth":8,
            "num_leaves":17,
            'max_bin': 15,
            'min_data_in_leaf': 15,
            'bagging_fraction': 0.9,
            'bagging_freq': 41,
            'feature_fraction': 0.9,
            'lambda_l1': 1e-05,
            'lambda_l2': 1e-05,
            'min_split_gain': 0.0,
            "verbose":-1,
            }

    # LightGBM 4.0 removed the `verbose_eval` argument of `train`; the
    # `log_evaluation` callback (available since 3.3) replaces it. Fall back to
    # `verbose_eval` on older versions that do not have the callback.
    if hasattr(lgbm, "log_evaluation"):
        log_kwargs = {"callbacks": [lgbm.log_evaluation(period=100)]}
    else:
        log_kwargs = {"verbose_eval": 100}

    seeds = range(10)
    fold_seed = 42
    n_splits = 10

    # Track the best model directly. Collecting only "improving" models in a list
    # and returning its last element fails with IndexError when the very first
    # fold already has the highest AUC.
    best_auc = None
    best_model = None
    for seed in seeds:

        print("---seed: ", seed)

        kfolds = KFold(n_splits=n_splits, random_state=fold_seed, shuffle=True)
        for trn_idx, val_idx in kfolds.split(X=X, y=y):

            # Over-sample the training part only; validate on the original data.
            kf_x_train, kf_y_train = smote.fit_resample(X[trn_idx], y[trn_idx])
            kf_x_valid, kf_y_valid = X[val_idx], y[val_idx]

            print(kf_x_train.shape, kf_y_train.shape)

            dtrain = lgbm.Dataset(kf_x_train, kf_y_train)
            dvalid = lgbm.Dataset(kf_x_valid, kf_y_valid)

            params["random_state"] = seed

            model = lgbm.train(params,
                               dtrain,
                               valid_sets=[dtrain, dvalid],
                               valid_names=["train", "valid"],
                               **log_kwargs)
            pred_valid_p = model.predict(kf_x_valid, num_iteration=model.best_iteration)

            auc = roc_auc_score(kf_y_valid, pred_valid_p)
            print(f"auc:{auc}")
            if best_model is None or auc > best_auc:
                # Confusion matrix of the new best model at a 0.5 threshold.
                pred_valid = [int(p >= 0.5) for p in pred_valid_p]
                C2 = confusion_matrix(kf_y_valid, pred_valid, labels=[0, 1])
                print('\nConfusion Matrix : \n', C2)
                best_auc = auc
                best_model = model
                print('=' * 80)
    return best_model

def predict_murmur(features, imputer, classifier1,classifier2):
    """Combine the two murmur classifiers into Present/Unknown/Absent probabilities.

    ``classifier1`` scores each recording as known (vs. Unknown) and
    ``classifier2`` scores each recording as Present (vs. Absent). If the mean
    "known" score is at least 0.5, the maximum score of each classifier is used;
    otherwise the minimum of each is used. The known probability is then split
    into Present and Absent according to the chosen ``classifier2`` score.

    Args:
        features: Dict whose 'features' entry holds one feature vector per recording.
        imputer: Not used.
        classifier1: Model with ``predict`` and ``best_iteration`` (known vs. Unknown).
        classifier2: Model with ``predict`` and ``best_iteration`` (Present vs. Absent).

    Returns:
        ``[present, unknown, absent]`` probabilities.
    """
    samples = np.array(features['features'])

    known_scores = classifier1.predict(samples, num_iteration=classifier1.best_iteration)
    present_scores = classifier2.predict(samples, num_iteration=classifier2.best_iteration)

    # Majority "known" -> take the most confident scores; otherwise the least confident.
    pick = np.max if np.mean(known_scores, axis=0) >= 0.5 else np.min
    p_known = pick(known_scores)
    p_unknown = 1.0 - p_known
    p_present_given_known = pick(present_scores, axis=0)

    # classifier2 encodes Absent as 0 and Present as 1.
    p_present = p_known * p_present_given_known
    p_absent = p_known * (1.0 - p_present_given_known)
    return [p_present, p_unknown, p_absent]

def predict_outcome(features, imputer, classifier):
    """Predict the Abnormal/Normal outcome probabilities for one patient.

    The classifier scores every recording; the smallest score is taken as the
    patient's "normal" probability and its complement as the "abnormal" one.

    Args:
        features: Dict whose 'features' entry holds one feature vector per recording.
        imputer: Not used.
        classifier: Model with ``predict`` and ``best_iteration``.

    Returns:
        ``[abnormal, normal]`` probabilities.
    """
    samples = np.array(features['features'])
    scores = classifier.predict(samples, num_iteration=classifier.best_iteration)

    # Use the lowest per-recording score as the patient-level "normal" probability.
    p_normal = scores.min()
    return [1.0 - p_normal, p_normal]

def Create_folder(filename):
    """Create a folder (including parents) unless the path already exists.

    Surrounding whitespace and trailing backslashes are stripped from
    ``filename`` first. A status message is printed in both cases.

    Returns:
        True if the folder was created, False if the path already existed.
    """
    path = filename.strip().rstrip("\\")

    if os.path.exists(path):
        print(path+"已存在")
        return False

    os.makedirs(path)
    print(path+"创建成功")
    return True
