import pickle
import numpy as np
import pandas as pd
import lightgbm as lgbm
from sklearn.model_selection import KFold
from imblearn.over_sampling import SVMSMOTE
from sklearn.model_selection import GridSearchCV


def prepare_model1():
    """Prepare the training data for classifier 1 ("Unknown" vs. the rest).

    Reads ``features/minmax_features.csv``, discards its last column (the
    outcome label), and treats the new last column as the label.  According
    to the original author's note the raw label encoding is
    ``{'Present': 0, 'Unknown': 1, 'Absent': 2}``; it is binarised here so
    that raw value 1 ("Unknown") becomes 0 and every other value becomes 1.

    Returns:
        tuple: ``(X, y, smote)`` where ``X`` is the feature matrix and ``y``
        the binarised label vector (both NumPy arrays taken from the frame),
        and ``smote`` is an unfitted ``SVMSMOTE`` over-sampler configured to
        balance the two classes 1:1 with ``random_state=42``.
    """
    features_df = pd.read_csv("features/minmax_features.csv")

    # Drop the trailing outcome column.  The explicit copy makes train_df an
    # independent frame, so the in-place rename and the column assignment
    # below never operate on a slice of features_df (which would raise
    # SettingWithCopyWarning and is unreliable without copy-on-write).
    train_df = features_df.iloc[:, :-1].copy()

    # The label is now the last column; give it a stable name.
    train_df.rename(columns={train_df.columns[-1]: "label"}, inplace=True)

    # Binarise: raw 1 ("Unknown") -> 0, anything else -> 1.
    train_df["label"] = (train_df["label"] != 1).astype("int64")
    print(train_df["label"].value_counts())

    X, y = train_df.iloc[:, :-1].values, train_df.iloc[:, -1].values
    print(X.shape, y.shape)

    smote = SVMSMOTE(sampling_strategy=1, random_state=42)

    return X, y, smote

def prepare_model2():
    """Prepare the training data for classifier 2 ("Present" vs. "Absent").

    Reads ``features/minmax_features.csv``, discards its last column (the
    outcome label), and treats the new last column as the label.  According
    to the original author's note the raw label encoding is
    ``{'Present': 0, 'Unknown': 1, 'Absent': 2}``.  Only rows whose raw label
    is 0 or 2 are kept; the label is then binarised so that raw value 2
    ("Absent") becomes 0 and raw value 0 ("Present") becomes 1.

    Returns:
        tuple: ``(X, y, smote)`` where ``X`` is the feature matrix and ``y``
        the binarised label vector (both NumPy arrays taken from the frame),
        and ``smote`` is an unfitted ``SVMSMOTE`` over-sampler configured to
        balance the two classes 1:1 with ``random_state=42``.
    """
    features_df = pd.read_csv("features/minmax_features.csv")

    # Drop the trailing outcome column; copy so the in-place rename below
    # does not act on a slice of features_df.
    train_df = features_df.iloc[:, :-1].copy()

    # The label is now the last column; give it a stable name.
    train_df.rename(columns={train_df.columns[-1]: "label"}, inplace=True)

    # Keep only the Present (0) / Absent (2) rows.  A boolean-mask selection
    # followed by a column assignment is the classic SettingWithCopyWarning
    # pattern, hence the second explicit copy.
    train_df = train_df[train_df["label"].isin([0, 2])].copy()

    # Binarise: raw 2 ("Absent") -> 0, raw 0 ("Present") -> 1.
    train_df["label"] = (train_df["label"] != 2).astype("int64")
    print(train_df["label"].value_counts())

    X, y = train_df.iloc[:, :-1].values, train_df.iloc[:, -1].values
    print(X.shape, y.shape)

    smote = SVMSMOTE(sampling_strategy=1, random_state=42)
    return X, y, smote

def prepare_model3():
    """Prepare the training data for classifier 3 (normal vs. abnormal outcome).

    Reads ``features/minmax_features.csv``, names its last two columns
    ``murmur`` and ``label`` respectively, and removes the ``murmur`` column
    so that the outcome column is the label.  Per the original author's note
    the label encoding is ``{'Abnormal': 0, 'Normal': 1}``; it is used as-is.

    Returns:
        tuple: ``(X, y, smote)`` where ``X`` is the feature matrix, ``y`` the
        label vector, and ``smote`` an unfitted ``SVMSMOTE`` over-sampler
        (``sampling_strategy=1``, ``random_state=42``).
    """
    raw_df = pd.read_csv("features/minmax_features.csv")

    # Second-to-last column is the murmur label, last one is the outcome.
    murmur_col, outcome_col = raw_df.columns[-2], raw_df.columns[-1]
    renamed_df = raw_df.rename(
        columns={murmur_col: "murmur", outcome_col: "label"}
    )

    # Only the outcome is predicted here, so the murmur label is discarded.
    train_df = renamed_df.drop(columns=["murmur"])
    print(train_df["label"].value_counts())

    feature_matrix = train_df.iloc[:, :-1].values
    label_vector = train_df.iloc[:, -1].values
    print(feature_matrix.shape, label_vector.shape)

    return (
        feature_matrix,
        label_vector,
        SVMSMOTE(sampling_strategy=1, random_state=42),
    )


def train_model1():
    """Grid-search the L1/L2 regularisation of the LightGBM classifier 1.

    Loads the data and the SVMSMOTE sampler from ``prepare_model1()`` and
    runs a 5-fold ``GridSearchCV`` (scored by ROC AUC) over ``lambda_l1`` and
    ``lambda_l2``; every other LightGBM parameter is fixed to the values in
    ``params`` below.

    The over-sampler is placed inside an imbalanced-learn ``Pipeline`` so it
    is fitted on the training part of each fold only.  Over-sampling the
    whole data set before cross-validation lets synthetic points derived
    from validation samples leak into the training folds and inflates the
    reported AUC.

    Prints the best CV score, the best classifier and the best parameters.

    Returns:
        The LightGBM classifier with the best parameters, refitted by the
        grid search on the over-sampled full training set.
    """
    # Local import keeps this function self-contained; imbalanced-learn is
    # already a dependency of this module (SVMSMOTE).
    from imblearn.pipeline import Pipeline

    X_train, y_train, smote = prepare_model1()

    # Fixed base parameters; only lambda_l1 / lambda_l2 are searched below.
    # NOTE(review): n_estimators was presumably picked with lgbm.cv and early
    # stopping (a commented-out snippet in the original did this) - confirm.
    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "metric": "auc",
        "learning_rate": 0.1,
        "n_estimators": 502,
        "max_depth": 7,
        "num_leaves": 70,
        "max_bin": 55,
        "min_data_in_leaf": 51,
        "bagging_fraction": 0.9,
        "bagging_freq": 60,
        "feature_fraction": 1.0,
        "lambda_l1": 0.0,
        "lambda_l2": 0.0,
        "min_split_gain": 0.0,
        "verbose": -1,
    }

    # Alternative grids the original author left commented out (presumably
    # earlier tuning rounds), kept for reference:
    #   max_depth: range(3, 8, 1), num_leaves: range(5, 100, 5)
    #   bagging_freq: range(50, 71, 1)
    #   max_bin: range(5, 256, 10), min_data_in_leaf: range(1, 102, 10)
    #   feature_fraction, bagging_fraction: [0.6, 0.7, 0.8, 0.9, 1.0],
    #       bagging_freq: range(0, 81, 10)
    #   min_split_gain: [0.0, 0.1, ..., 1.0]
    #   learning_rate: [0.1, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.001]

    # The original list contained both 1e-1 and 0.1, which are the same
    # float; the duplicate only added redundant fits and is removed here.
    reg_values = [1e-5, 1e-3, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
    param_grid = {"clf__lambda_l1": reg_values, "clf__lambda_l2": reg_values}

    pipeline = Pipeline(
        [("smote", smote), ("clf", lgbm.LGBMClassifier(**params))]
    )
    grid_search = GridSearchCV(
        pipeline,
        n_jobs=-1,
        param_grid=param_grid,
        cv=5,
        scoring="roc_auc",
        verbose=100,
    )
    grid_search.fit(X_train, y_train)

    # Unwrap the classifier and strip the "clf__" pipeline prefix so the
    # printed report has the same shape as before.
    model = grid_search.best_estimator_.named_steps["clf"]
    best_params = {
        name.split("__", 1)[1]: value
        for name, value in grid_search.best_params_.items()
    }

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best estimator:", model)
    print("最佳参数:", best_params)
    print('=' * 80)
    return model


def train_model2():
    """Grid-search the L1/L2 regularisation of the LightGBM classifier 2.

    Loads the data and the SVMSMOTE sampler from ``prepare_model2()`` and
    runs a 5-fold ``GridSearchCV`` (scored by ROC AUC) over ``lambda_l1`` and
    ``lambda_l2``; every other LightGBM parameter is fixed to the values in
    ``params`` below.

    The over-sampler is placed inside an imbalanced-learn ``Pipeline`` so it
    is fitted on the training part of each fold only.  Over-sampling the
    whole data set before cross-validation lets synthetic points derived
    from validation samples leak into the training folds and inflates the
    reported AUC.

    Prints the best CV score, the best classifier and the best parameters.

    Returns:
        The LightGBM classifier with the best parameters, refitted by the
        grid search on the over-sampled full training set.
    """
    # Local import keeps this function self-contained; imbalanced-learn is
    # already a dependency of this module (SVMSMOTE).
    from imblearn.pipeline import Pipeline

    X_train, y_train, smote = prepare_model2()

    # Fixed base parameters; only lambda_l1 / lambda_l2 are searched below.
    # NOTE(review): n_estimators was presumably picked with lgbm.cv and early
    # stopping (a commented-out snippet in the original did this) - confirm.
    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "metric": "auc",
        "learning_rate": 0.1,
        "n_estimators": 2074,
        "max_depth": 7,
        "num_leaves": 58,
        "max_bin": 15,
        "min_data_in_leaf": 2,
        "bagging_fraction": 0.9,
        "bagging_freq": 40,
        "feature_fraction": 1.0,
        "lambda_l1": 0.0,
        "lambda_l2": 0.0,
        "min_split_gain": 0.0,
        "verbose": -1,
    }

    # Alternative grids the original author left commented out (presumably
    # earlier tuning rounds), kept for reference:
    #   max_depth: range(3, 8, 1), num_leaves: range(5, 100, 5)
    #   bagging_freq: range(30, 51, 1)
    #   max_bin: range(5, 256, 10), min_data_in_leaf: range(1, 102, 10)
    #   feature_fraction, bagging_fraction: [0.6, 0.7, 0.8, 0.9, 1.0],
    #       bagging_freq: range(0, 81, 10)
    #   min_split_gain: [0.0, 0.1, ..., 1.0]
    #   learning_rate: [0.1, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.001]

    # The original list contained both 1e-1 and 0.1, which are the same
    # float; the duplicate only added redundant fits and is removed here.
    reg_values = [1e-5, 1e-3, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
    param_grid = {"clf__lambda_l1": reg_values, "clf__lambda_l2": reg_values}

    pipeline = Pipeline(
        [("smote", smote), ("clf", lgbm.LGBMClassifier(**params))]
    )
    grid_search = GridSearchCV(
        pipeline,
        n_jobs=-1,
        param_grid=param_grid,
        cv=5,
        scoring="roc_auc",
        verbose=100,
    )
    grid_search.fit(X_train, y_train)

    # Unwrap the classifier and strip the "clf__" pipeline prefix so the
    # printed report has the same shape as before.
    model = grid_search.best_estimator_.named_steps["clf"]
    best_params = {
        name.split("__", 1)[1]: value
        for name, value in grid_search.best_params_.items()
    }

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best estimator:", model)
    print("最佳参数:", best_params)
    print('=' * 80)
    return model

def train_model3():
    """Grid-search the L1/L2 regularisation of the LightGBM classifier 3.

    Loads the data and the SVMSMOTE sampler from ``prepare_model3()`` and
    runs a 5-fold ``GridSearchCV`` (scored by ROC AUC) over ``lambda_l1`` and
    ``lambda_l2``; every other LightGBM parameter is fixed to the values in
    ``params`` below.

    The over-sampler is placed inside an imbalanced-learn ``Pipeline`` so it
    is fitted on the training part of each fold only.  Over-sampling the
    whole data set before cross-validation lets synthetic points derived
    from validation samples leak into the training folds and inflates the
    reported AUC.

    Prints the best CV score, the best classifier and the best parameters.

    Returns:
        The LightGBM classifier with the best parameters, refitted by the
        grid search on the over-sampled full training set.
    """
    # Local import keeps this function self-contained; imbalanced-learn is
    # already a dependency of this module (SVMSMOTE).
    from imblearn.pipeline import Pipeline

    X_train, y_train, smote = prepare_model3()

    # Fixed base parameters; only lambda_l1 / lambda_l2 are searched below.
    # NOTE(review): n_estimators was presumably picked with lgbm.cv and early
    # stopping (a commented-out snippet in the original did this) - confirm.
    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "metric": "auc",
        "learning_rate": 0.1,
        "n_estimators": 1055,
        "max_depth": 7,
        "num_leaves": 55,
        "max_bin": 245,
        "min_data_in_leaf": 1,
        "bagging_fraction": 0.8,
        "bagging_freq": 60,
        "feature_fraction": 0.7,
        "lambda_l1": 0.0,
        "lambda_l2": 0.0,
        "min_split_gain": 0.0,
        "verbose": -1,
    }

    # Alternative grids the original author left commented out (presumably
    # earlier tuning rounds), kept for reference:
    #   max_depth: range(3, 8, 1), num_leaves: range(5, 100, 5)
    #   min_data_in_leaf: range(45, 65, 1)
    #   max_bin: range(5, 256, 10), min_data_in_leaf: range(1, 102, 10)
    #   feature_fraction, bagging_fraction: [0.6, 0.7, 0.8, 0.9, 1.0],
    #       bagging_freq: range(0, 81, 10)
    #   min_split_gain: [0.0, 0.1, ..., 1.0]
    #   learning_rate: [0.1, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.001]

    # The original list contained both 1e-1 and 0.1, which are the same
    # float; the duplicate only added redundant fits and is removed here.
    reg_values = [1e-5, 1e-3, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
    param_grid = {"clf__lambda_l1": reg_values, "clf__lambda_l2": reg_values}

    pipeline = Pipeline(
        [("smote", smote), ("clf", lgbm.LGBMClassifier(**params))]
    )
    grid_search = GridSearchCV(
        pipeline,
        n_jobs=-1,
        param_grid=param_grid,
        cv=5,
        scoring="roc_auc",
        verbose=100,
    )
    grid_search.fit(X_train, y_train)

    # Unwrap the classifier and strip the "clf__" pipeline prefix so the
    # printed report has the same shape as before.
    model = grid_search.best_estimator_.named_steps["clf"]
    best_params = {
        name.split("__", 1)[1]: value
        for name, value in grid_search.best_params_.items()
    }

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best estimator:", model)
    print("最佳参数:", best_params)
    print('=' * 80)
    return model
   

if __name__ == "__main__":
    # Script entry point: only the grid search for classifier 3 runs.
    # The searches for classifiers 1 and 2 are disabled; uncomment the
    # corresponding call to run them as well.
    # train_model1()
    # train_model2()
    train_model3()
    