본문 바로가기
DL

[PUBG] ML_baseline(lightgbm)

by YGSEO 2020. 6. 2.
728x90
ML_Baseline

Summary from outlier

  1. headshotrate > 0.5
  2. damagedealt >= 4000
  3. kills > 60
  4. killstreak > 10
  5. walkdistance > 7.5k
  6. weaponacquired > 20
  7. heals > 40
  8. boosts > 20
  9. totaldistance == 0 & kills > 0
  10. walkdistance == 0 & kills > 0
  11. ridedistance == 0 & roadkills > 0
  12. weaponacquired == 0 & winplaceperc > 0.5
  13. heals == 0 & winplaceperc > 0.8
  14. heals and boosts == 0 & winplaceperc > 0.8
  15. one NaN in target value
In [2]:
import os, time, gc
import pandas as pd, numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
os.listdir('input')
Out[3]:
['test_V2.csv', 'sample_submission_V2.csv', 'train_V2.csv']
In [4]:
%%time
tr = pd.read_csv("input/train_V2.csv")
te = pd.read_csv("input/test_V2.csv")
CPU times: user 11.3 s, sys: 706 ms, total: 12 s
Wall time: 12.1 s
In [5]:
def missing_values_table(df):# Function to calculate missing values by column# Funct 
    mis_val = df.isnull().sum() # Total missing values
    mis_val_pct = 100 * df.isnull().sum() / len(df)# Percentage of missing values
    mis_val_df = pd.concat([mis_val, mis_val_pct], axis=1)# Make a table with the results
    mis_val_df_cols = mis_val_df.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})# Rename the columns
    mis_val_df_cols = mis_val_df_cols[mis_val_df_cols.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)# Sort the table by percentage of missing descending
    print ("Dataframe has " + str(df.shape[1]) + " columns.\n" 
           "There are " + str(mis_val_df_cols.shape[0]) + " cols having missing values.")# Print some summary information
    return mis_val_df_cols # Return the dataframe with missing information
In [6]:
missing_values_table(tr)
Dataframe has 29 columns.
There are 1 cols having missing values.
Out[6]:
Missing Values % of Total Values
winPlacePerc 1 0.0
In [7]:
missing_values_table(te)
Dataframe has 28 columns.
There are 0 cols having missing values.
Out[7]:
Missing Values % of Total Values
In [8]:
tr[tr['winPlacePerc'].isnull()]
Out[8]:
Id groupId matchId assists boosts damageDealt DBNOs headshotKills heals killPlace ... revives rideDistance roadKills swimDistance teamKills vehicleDestroys walkDistance weaponsAcquired winPoints winPlacePerc
2744604 f70c74418bb064 12dfbede33f92b 224a123c53e008 0 0 0.0 0 0 0 1 ... 0 0.0 0 0.0 0 0 0.0 0 0 NaN

1 rows × 29 columns

In [9]:
tr.drop(2744604, inplace=True)
In [10]:
def base(df):
    df['headshot_rate'] = df['headshotKills']/df['kills']
    df['total_distance'] = df[df.filter(regex='Dist').columns.tolist()].sum(axis=1)
    df['heals_n_boosts'] = df[['heals','boosts']].sum(axis=1)
    return df
In [11]:
tr = base(tr)
te = base(te)
In [12]:
tr['is_outlier'] = np.where(tr['headshot_rate']> 0.5, 1, 0)
tr['is_outlier'] = np.where(tr['damageDealt']>=4000, 1, 0)
tr['is_outlier'] = np.where(tr['kills']> 60, 1, 0)
tr['is_outlier'] = np.where(tr['killStreaks']> 10, 1, 0)
tr['is_outlier'] = np.where(tr['walkDistance']> 7500, 1, 0)
tr['is_outlier'] = np.where(tr['weaponsAcquired']> 20, 1, 0)
tr['is_outlier'] = np.where(tr['heals']> 40, 1, 0)
tr['is_outlier'] = np.where(tr['boosts']> 20, 1, 0)
tr['is_outlier'] = np.where((tr['total_distance']==0)&(tr['kills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['walkDistance']==0)&(tr['kills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['rideDistance']==0)&(tr['roadKills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['weaponsAcquired']==0)&(tr['winPlacePerc']>0.5), 1, 0)
tr['is_outlier'] = np.where((tr['heals']==0)&(tr['winPlacePerc']>0.8), 1, 0)
tr['is_outlier'] = np.where((tr['heals_n_boosts']==0)&(tr['winPlacePerc']>0.8), 1, 0)
In [13]:
print("The total number of outliers")
print("n = {}".format(tr['is_outlier'].sum()))
print("% = {}".format(tr['is_outlier'].sum()/tr.shape[0]))
The total number of outliers
n = 81343
% = 0.01829180126220917

Build Model

Train model w/o outliers

In [14]:
print(tr.shape)
tr = tr[tr['is_outlier']==0]
print(tr.shape)
(4446965, 33)
(4365622, 33)
In [15]:
# exclude target and outlier columns
base_feat = tr.columns[3:-6].tolist()+tr.columns[29:-1].tolist()
train_Y = tr['winPlacePerc']

Make object type to category for lightgbm model to take.

In [16]:
tr[base_feat].dtypes.value_counts()
Out[16]:
int64      19
float64     7
object      1
dtype: int64
In [17]:
tr[base_feat].select_dtypes(include=['object']).columns
Out[17]:
Index(['matchType'], dtype='object')
In [18]:
tr['matchType'] = tr['matchType'].astype('category')
te['matchType'] = te['matchType'].astype('category')

Now object type changed to category

In [19]:
tr[base_feat].dtypes.value_counts()
Out[19]:
int64       19
float64      7
category     1
dtype: int64
In [20]:
import time
from sklearn import metrics
from operator import itemgetter
import lightgbm as lgb
In [21]:
from sklearn.model_selection import train_test_split
def LGB_HOLDOUT_REG(size_ratio, train_X, test_X, metric, is_shuffle):
    X_tr,X_val,y_tr,y_val = train_test_split(train_X, train_Y, train_size = size_ratio, shuffle=is_shuffle)

    val_lgb = np.zeros(len(X_val))
    predictions = np.zeros(len(test_X))
    
    # Model parameters
    lgb_params = {'num_leaves': 31,
                 'min_data_in_leaf': 20, 
                 'objective':'regression',
                 'max_depth': -1,
                 'learning_rate': 0.1,
                 "boosting": "gbdt",
                 "feature_fraction": 1,
                 "bagging_freq": 1,
                 "bagging_fraction": 1,
                 "bagging_seed": 42,
                 "metric": metric,
                 "lambda_l1": 0.0,
                 "verbosity": 1000,
                 "nthread": -1,
                 "random_state": 42}

    model_start = time.time()

    model = lgb.LGBMRegressor(**lgb_params, n_estimators = 20000, n_jobs = -1)
    model.fit(X_tr, 
              y_tr, 
              eval_set=[(X_tr, y_tr), (X_val, y_val)], 
              eval_metric=metric,
              verbose=1000, 
              early_stopping_rounds=200)
    val_lgb = model.predict(X_val, num_iteration=model.best_iteration_)

    cv_score = model.best_score_

    #feature importance
    feature_importance_df = pd.DataFrame()
    feature_importance_df["Feature"] = train_X.columns
    feature_importance_df["importance"] = model.feature_importances_[:len(train_X.columns)]

    #predictions
    predictions = model.predict(test_X, num_iteration=model.best_iteration_)

    print("-" * 50)
    print("HOLD_OUT "+ metric + " = {}".format(cv_score))
    lgb.plot_metric(model, metric=metric, title='auc plot', xlabel='Iterations', ylabel='auto', figsize=(10,8), grid=False)

    model_end = time.time()
    model_elapsed = model_end - model_start
    print('Model elapsed {0:0.2f}'.format(model_elapsed/60), "minutes.")

    # FEATURE IMPORTANCE
    pd.set_option('display.max_rows', 500)
    feature_importance_df['Feature Rank'] = feature_importance_df['importance'].rank(ascending=0)
    feature_importance_df = feature_importance_df.sort_values('Feature Rank', ascending = True)
    print(feature_importance_df.loc[feature_importance_df['importance']!=0].head(100))

    return predictions, val_lgb, cv_score, feature_importance_df
In [22]:
pred, val, cv, feat = LGB_HOLDOUT_REG(0.8, tr[base_feat], te[base_feat], 'l1', is_shuffle=True)
Training until validation scores don't improve for 200 rounds
[1000]	training's l1: 0.0546375	valid_1's l1: 0.0551949
[2000]	training's l1: 0.0537246	valid_1's l1: 0.0546378
[3000]	training's l1: 0.0530826	valid_1's l1: 0.0543411
[4000]	training's l1: 0.0525343	valid_1's l1: 0.0541235
[5000]	training's l1: 0.0520637	valid_1's l1: 0.0539715
[6000]	training's l1: 0.0516189	valid_1's l1: 0.0538396
[7000]	training's l1: 0.051173	valid_1's l1: 0.053685
[8000]	training's l1: 0.0507505	valid_1's l1: 0.0535569
[9000]	training's l1: 0.0503807	valid_1's l1: 0.0534782
[10000]	training's l1: 0.050026	valid_1's l1: 0.0534142
[11000]	training's l1: 0.0496732	valid_1's l1: 0.0533334
[12000]	training's l1: 0.049352	valid_1's l1: 0.0532831
[13000]	training's l1: 0.049041	valid_1's l1: 0.0532289
[14000]	training's l1: 0.0487191	valid_1's l1: 0.053175
[15000]	training's l1: 0.0484063	valid_1's l1: 0.0531278
[16000]	training's l1: 0.0481076	valid_1's l1: 0.0530745
[17000]	training's l1: 0.0478092	valid_1's l1: 0.0530234
[18000]	training's l1: 0.0475326	valid_1's l1: 0.0529852
[19000]	training's l1: 0.0472439	valid_1's l1: 0.0529332
[20000]	training's l1: 0.0469709	valid_1's l1: 0.0528894
Did not meet early stopping. Best iteration is:
[20000]	training's l1: 0.0469709	valid_1's l1: 0.0528894
--------------------------------------------------
HOLD_OUT l1 = defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l1', 0.046970920374816025)]), 'valid_1': OrderedDict([('l1', 0.05288943910456932)])})
Model elapsed 32.89 minutes.
            Feature  importance  Feature Rank
11    matchDuration       73986           1.0
6         killPlace       56948           2.0
22     walkDistance       52129           3.0
2       damageDealt       44870           4.0
10      longestKill       43943           5.0
25   total_distance       37971           6.0
14        numGroups       37328           7.0
13         maxPlace       35812           8.0
15       rankPoints       34427           9.0
7        killPoints       24704          10.0
17     rideDistance       20777          11.0
23  weaponsAcquired       20013          12.0
26   heals_n_boosts       17158          13.0
12        matchType       13759          14.0
24    headshot_rate       13527          15.0
5             heals       13297          16.0
1            boosts       11915          17.0
3             DBNOs        9653          18.0
8             kills        8789          19.0
19     swimDistance        7401          20.0
0           assists        7071          21.0
16          revives        4741          22.0
9       killStreaks        4212          23.0
4     headshotKills        3197          24.0
20        teamKills        1354          25.0
21  vehicleDestroys         573          26.0
18        roadKills         445          27.0

Feature Importance

In [25]:
plt.figure(figsize=(20, 10))
sns.barplot(x="importance", y="Feature", data=feat)
plt.title('LightGBM Features')
plt.tight_layout()
plt.show()
728x90

'DL' 카테고리의 다른 글

[Git repo] from "git clone" to "merge"  (0) 2021.01.31
Pytorch Dataset - cv2.imread 메모리 사용  (0) 2021.01.31
[PUBG] Detecting Outliers  (0) 2020.06.02
[PUBG] EDA  (0) 2020.05.29
[tabnet] beating tablet data with deep learning  (0) 2020.05.28

댓글