Summary from outlier¶

headshotrate > 0.5
damagedealt >= 4000
kills > 60
killstreak > 10
walkdistance > 7.5k
weaponacquired > 20
heals > 40
boosts > 20
totaldistance == 0 & kills > 0
walkdistance == 0 & kills > 0
ridedistance == 0 & roadkills > 0
weaponacquired == 0 & winplaceperc > 0.5
heals == 0 & winplaceperc > 0.8
heals and boosts == 0 & winplaceperc > 0.8
one NaN in target value

import os, time, gc
import pandas as pd, numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

os.listdir('input')

['test_V2.csv', 'sample_submission_V2.csv', 'train_V2.csv']

%%time
tr = pd.read_csv("input/train_V2.csv")
te = pd.read_csv("input/test_V2.csv")

CPU times: user 11.3 s, sys: 706 ms, total: 12 s
Wall time: 12.1 s

def missing_values_table(df):# Function to calculate missing values by column# Funct 
    mis_val = df.isnull().sum() # Total missing values
    mis_val_pct = 100 * df.isnull().sum() / len(df)# Percentage of missing values
    mis_val_df = pd.concat([mis_val, mis_val_pct], axis=1)# Make a table with the results
    mis_val_df_cols = mis_val_df.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})# Rename the columns
    mis_val_df_cols = mis_val_df_cols[mis_val_df_cols.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)# Sort the table by percentage of missing descending
    print ("Dataframe has " + str(df.shape[1]) + " columns.\n" 
           "There are " + str(mis_val_df_cols.shape[0]) + " cols having missing values.")# Print some summary information
    return mis_val_df_cols # Return the dataframe with missing information

missing_values_table(tr)

Dataframe has 29 columns.
There are 1 cols having missing values.

missing_values_table(te)

Dataframe has 28 columns.
There are 0 cols having missing values.

tr[tr['winPlacePerc'].isnull()]

tr.drop(2744604, inplace=True)

def base(df):
    df['headshot_rate'] = df['headshotKills']/df['kills']
    df['total_distance'] = df[df.filter(regex='Dist').columns.tolist()].sum(axis=1)
    df['heals_n_boosts'] = df[['heals','boosts']].sum(axis=1)
    return df

tr = base(tr)
te = base(te)

tr['is_outlier'] = np.where(tr['headshot_rate']> 0.5, 1, 0)
tr['is_outlier'] = np.where(tr['damageDealt']>=4000, 1, 0)
tr['is_outlier'] = np.where(tr['kills']> 60, 1, 0)
tr['is_outlier'] = np.where(tr['killStreaks']> 10, 1, 0)
tr['is_outlier'] = np.where(tr['walkDistance']> 7500, 1, 0)
tr['is_outlier'] = np.where(tr['weaponsAcquired']> 20, 1, 0)
tr['is_outlier'] = np.where(tr['heals']> 40, 1, 0)
tr['is_outlier'] = np.where(tr['boosts']> 20, 1, 0)
tr['is_outlier'] = np.where((tr['total_distance']==0)&(tr['kills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['walkDistance']==0)&(tr['kills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['rideDistance']==0)&(tr['roadKills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['weaponsAcquired']==0)&(tr['winPlacePerc']>0.5), 1, 0)
tr['is_outlier'] = np.where((tr['heals']==0)&(tr['winPlacePerc']>0.8), 1, 0)
tr['is_outlier'] = np.where((tr['heals_n_boosts']==0)&(tr['winPlacePerc']>0.8), 1, 0)

print("The total number of outliers")
print("n = {}".format(tr['is_outlier'].sum()))
print("% = {}".format(tr['is_outlier'].sum()/tr.shape[0]))

The total number of outliers
n = 81343
% = 0.01829180126220917

Build Model¶

Train model w/o outliers

print(tr.shape)
tr = tr[tr['is_outlier']==0]
print(tr.shape)

(4446965, 33)
(4365622, 33)

# exclude target and outlier columns
base_feat = tr.columns[3:-6].tolist()+tr.columns[29:-1].tolist()
train_Y = tr['winPlacePerc']

Make object type to category for lightgbm model to take.

tr[base_feat].dtypes.value_counts()

int64      19
float64     7
object      1
dtype: int64

tr[base_feat].select_dtypes(include=['object']).columns

Index(['matchType'], dtype='object')

tr['matchType'] = tr['matchType'].astype('category')
te['matchType'] = te['matchType'].astype('category')

Now object type changed to category

tr[base_feat].dtypes.value_counts()

int64       19
float64      7
category     1
dtype: int64

import time
from sklearn import metrics
from operator import itemgetter
import lightgbm as lgb

from sklearn.model_selection import train_test_split
def LGB_HOLDOUT_REG(size_ratio, train_X, test_X, metric, is_shuffle):
    X_tr,X_val,y_tr,y_val = train_test_split(train_X, train_Y, train_size = size_ratio, shuffle=is_shuffle)

    val_lgb = np.zeros(len(X_val))
    predictions = np.zeros(len(test_X))
    
    # Model parameters
    lgb_params = {'num_leaves': 31,
                 'min_data_in_leaf': 20, 
                 'objective':'regression',
                 'max_depth': -1,
                 'learning_rate': 0.1,
                 "boosting": "gbdt",
                 "feature_fraction": 1,
                 "bagging_freq": 1,
                 "bagging_fraction": 1,
                 "bagging_seed": 42,
                 "metric": metric,
                 "lambda_l1": 0.0,
                 "verbosity": 1000,
                 "nthread": -1,
                 "random_state": 42}

    model_start = time.time()

    model = lgb.LGBMRegressor(**lgb_params, n_estimators = 20000, n_jobs = -1)
    model.fit(X_tr, 
              y_tr, 
              eval_set=[(X_tr, y_tr), (X_val, y_val)], 
              eval_metric=metric,
              verbose=1000, 
              early_stopping_rounds=200)
    val_lgb = model.predict(X_val, num_iteration=model.best_iteration_)

    cv_score = model.best_score_

    #feature importance
    feature_importance_df = pd.DataFrame()
    feature_importance_df["Feature"] = train_X.columns
    feature_importance_df["importance"] = model.feature_importances_[:len(train_X.columns)]

    #predictions
    predictions = model.predict(test_X, num_iteration=model.best_iteration_)

    print("-" * 50)
    print("HOLD_OUT "+ metric + " = {}".format(cv_score))
    lgb.plot_metric(model, metric=metric, title='auc plot', xlabel='Iterations', ylabel='auto', figsize=(10,8), grid=False)

    model_end = time.time()
    model_elapsed = model_end - model_start
    print('Model elapsed {0:0.2f}'.format(model_elapsed/60), "minutes.")

    # FEATURE IMPORTANCE
    pd.set_option('display.max_rows', 500)
    feature_importance_df['Feature Rank'] = feature_importance_df['importance'].rank(ascending=0)
    feature_importance_df = feature_importance_df.sort_values('Feature Rank', ascending = True)
    print(feature_importance_df.loc[feature_importance_df['importance']!=0].head(100))

    return predictions, val_lgb, cv_score, feature_importance_df

pred, val, cv, feat = LGB_HOLDOUT_REG(0.8, tr[base_feat], te[base_feat], 'l1', is_shuffle=True)

Training until validation scores don't improve for 200 rounds
[1000]	training's l1: 0.0546375	valid_1's l1: 0.0551949
[2000]	training's l1: 0.0537246	valid_1's l1: 0.0546378
[3000]	training's l1: 0.0530826	valid_1's l1: 0.0543411
[4000]	training's l1: 0.0525343	valid_1's l1: 0.0541235
[5000]	training's l1: 0.0520637	valid_1's l1: 0.0539715
[6000]	training's l1: 0.0516189	valid_1's l1: 0.0538396
[7000]	training's l1: 0.051173	valid_1's l1: 0.053685
[8000]	training's l1: 0.0507505	valid_1's l1: 0.0535569
[9000]	training's l1: 0.0503807	valid_1's l1: 0.0534782
[10000]	training's l1: 0.050026	valid_1's l1: 0.0534142
[11000]	training's l1: 0.0496732	valid_1's l1: 0.0533334
[12000]	training's l1: 0.049352	valid_1's l1: 0.0532831
[13000]	training's l1: 0.049041	valid_1's l1: 0.0532289
[14000]	training's l1: 0.0487191	valid_1's l1: 0.053175
[15000]	training's l1: 0.0484063	valid_1's l1: 0.0531278
[16000]	training's l1: 0.0481076	valid_1's l1: 0.0530745
[17000]	training's l1: 0.0478092	valid_1's l1: 0.0530234
[18000]	training's l1: 0.0475326	valid_1's l1: 0.0529852
[19000]	training's l1: 0.0472439	valid_1's l1: 0.0529332
[20000]	training's l1: 0.0469709	valid_1's l1: 0.0528894
Did not meet early stopping. Best iteration is:
[20000]	training's l1: 0.0469709	valid_1's l1: 0.0528894
--------------------------------------------------
HOLD_OUT l1 = defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('l1', 0.046970920374816025)]), 'valid_1': OrderedDict([('l1', 0.05288943910456932)])})
Model elapsed 32.89 minutes.
            Feature  importance  Feature Rank
11    matchDuration       73986           1.0
6         killPlace       56948           2.0
22     walkDistance       52129           3.0
2       damageDealt       44870           4.0
10      longestKill       43943           5.0
25   total_distance       37971           6.0
14        numGroups       37328           7.0
13         maxPlace       35812           8.0
15       rankPoints       34427           9.0
7        killPoints       24704          10.0
17     rideDistance       20777          11.0
23  weaponsAcquired       20013          12.0
26   heals_n_boosts       17158          13.0
12        matchType       13759          14.0
24    headshot_rate       13527          15.0
5             heals       13297          16.0
1            boosts       11915          17.0
3             DBNOs        9653          18.0
8             kills        8789          19.0
19     swimDistance        7401          20.0
0           assists        7071          21.0
16          revives        4741          22.0
9       killStreaks        4212          23.0
4     headshotKills        3197          24.0
20        teamKills        1354          25.0
21  vehicleDestroys         573          26.0
18        roadKills         445          27.0

Feature Importance

plt.figure(figsize=(20, 10))
sns.barplot(x="importance", y="Feature", data=feat)
plt.title('LightGBM Features')
plt.tight_layout()
plt.show()

[Git repo] from "git clone" to "merge" (0)	2021.01.31
Pytorch Dataset - cv2.imread 메모리 사용 (0)	2021.01.31
[PUBG] Detecting Outliers (0)	2020.06.02
[PUBG] EDA (0)	2020.05.29
[tabnet] beating tablet data with deep learning (0)	2020.05.28

YGSEO

[PUBG] ML_baseline(lightgbm)

Summary from outlier¶

Build Model¶

'DL' 카테고리의 다른 글

댓글

티스토리툴바

[PUBG] ML_baseline(lightgbm)

Summary from outlier¶

Build Model¶

'DL' 카테고리의 다른 글

관련글

댓글

티스토리툴바