import numpy as np
import pandas as pd
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from lightgbm import LGBMClassifier

tqdm.pandas()


# Load the sample car-hacking dataset from the remote CSV file
df = pd.read_csv('https://raw.githubusercontent.com/hyeonkeemin/dip_test/main/CarPlatform/sample_car_hacking.csv')

# Inspect the data: print the shape and preview the first rows
print(f"Row: {df.shape[0]}, Columns: {df.shape[1]}")
df.head()

Row: 80639, Columns: 7


# Visualize the number of rows per class, grouped by DLC value
plt.subplots(figsize=(8, 6))
sns.countplot(x='DLC', hue='Class', data=df)
plt.show()


# Visualize the proportion of each class within every DLC value
plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x='DLC', hue='Class', multiple='fill', stat='probability')
plt.show()


# Split into training (80%) and test (20%) data.
# shuffle=False keeps the original row order; from here on `df` refers to the
# training portion only.
df, test = train_test_split(df, test_size=0.2, shuffle=False)


# Separate the target column of the training data
train_y = df['Class']

# Drop the columns that are not used as features
df = df.drop(['Timestamp', 'Class', 'Unnamed: 0', 'SubClass'], axis=1)

# Separate the target column of the test data
test_y = test['Class']

# Drop the columns that are not used as features
test = test.drop(['Timestamp', 'Class', 'Unnamed: 0', 'SubClass'], axis=1)

# Count the training rows per target class
train_y.value_counts()

Normal    58726
Attack     5785
Name: Class, dtype: int64


# Count the CAN messages per data-length code (DLC)
df['DLC'].value_counts()

8    52940
4     4971
6     2641
7     2597
5     1269
2       93
Name: DLC, dtype: int64


# Check the number of missing values in each column
df.isna().sum()

Arbitration_ID    0
DLC               0
Data              0
dtype: int64


def Data_col_split(df):
    """Split the space-separated ``Data`` column into ``Data_1`` .. ``Data_8``.

    Each row's ``Data`` string is split on single spaces and the resulting
    tokens are placed in separate columns appended after the existing ones.
    The original ``Data`` column is removed. The input frame is not modified.

    Args:
        df: DataFrame with a string column named ``Data``.

    Returns:
        A new DataFrame holding the remaining input columns followed by
        ``Data_1`` .. ``Data_8``. Rows with fewer than eight tokens have NaN
        in the unused positions, and all eight columns are present even when
        no row in ``df`` contains eight tokens.
    """
    n_fields = 8

    # Vectorised split; avoids building one pd.Series per row.
    parts = df['Data'].str.split(' ', expand=True)

    # Make sure positions 0..7 always exist so that downstream code can rely
    # on every Data_N column; columns beyond the eighth (if any) are kept.
    parts = parts.reindex(columns=range(max(n_fields, parts.shape[1])))

    # str.split pads short rows with None; normalise those to NaN.
    parts = parts.where(parts.notna(), np.nan)

    parts = parts.rename(
        columns={pos: f'Data_{pos + 1}' for pos in range(n_fields)}
    )
    return pd.concat([df.drop('Data', axis=1), parts], axis=1)


# Preprocess the training data with the function defined above
df = Data_col_split(df)

# Inspect the result
df.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64511/64511 [00:07<00:00, 8251.52it/s]


# Create a label encoder and fit it on the training Arbitration_ID values
le = LabelEncoder().fit(df['Arbitration_ID'])

# Transform the column with the fitted encoder
df['Arbitration_ID'] = le.transform(df['Arbitration_ID'])

# Inspect the result
df.head()


# Names of the per-byte payload columns produced by Data_col_split
col = ['Data_1', 'Data_2', 'Data_3', 'Data_4', 'Data_5', 'Data_6', 'Data_7', 'Data_8']

# Parse every hexadecimal string into an integer; non-string cells (the NaN
# padding of short messages) are passed through unchanged.
for i in col:
    df[i] = df[i].apply(lambda x: int(x, 16) if isinstance(x, str) else x)

# Mark the missing payload positions with -1
df[col] = df[col].fillna(-1)


def data_preprocessing(df):
    """Apply the same feature pipeline used on the training data.

    The steps are: split ``Data`` into ``Data_1`` .. ``Data_8``, encode
    ``Arbitration_ID`` with the already fitted module-level encoder ``le``,
    parse the hexadecimal payload strings into integers, and fill missing
    payload positions with -1.

    Relies on the module-level names ``le`` (fitted label encoder), ``col``
    (list of payload column names) and ``Data_col_split``.

    Args:
        df: DataFrame with at least the columns ``Arbitration_ID`` and
            ``Data``. It is not modified.

    Returns:
        A new, fully numeric-encoded DataFrame.

    Note:
        ``le.transform`` is expected to raise ``ValueError`` when ``df``
        contains an Arbitration_ID that was not present when ``le`` was
        fitted (scikit-learn behaviour; not handled here).
    """
    # Reuse the shared splitter so training and test data are split the same
    # way; it returns a new frame, so the caller's frame is left untouched.
    df = Data_col_split(df)

    # Label encoding with the encoder fitted on the training data
    df['Arbitration_ID'] = le.transform(df['Arbitration_ID'])

    # Hexadecimal strings -> integers; NaN padding is passed through
    for name in col:
        df[name] = df[name].apply(
            lambda x: int(x, 16) if isinstance(x, str) else x
        )

    # Missing payload positions are marked with -1
    df[col] = df[col].fillna(-1)

    return df


# Preview the preprocessed training features
df.head()


# Inspect the data: print the shape and preview the first rows
print(f"Row: {df.shape[0]}, Columns: {df.shape[1]}")
df.head()

Row: 64511, Columns: 10


# Create the resampler: SMOTE over-sampling combined with Tomek-link removal
# applied to the majority class.
# NOTE(review): no random_state is passed here, so the resampled data may
# differ between runs — consider fixing a seed if reproducibility matters.
smote_tomek = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))

# Resample the training features and labels
df_smt, y_smt = smote_tomek.fit_resample(df, train_y)


# Inspect the resampled data: print the shape and preview the first rows
print(f"Row: {df_smt.shape[0]}, Columns: {df_smt.shape[1]}")
df_smt.head()

Row: 117355, Columns: 10


# Class counts of the training labels before resampling
train_y.value_counts()

Normal    58726
Attack     5785
Name: Class, dtype: int64


# Class counts of the training labels after resampling
y_smt.value_counts()

Normal    58726
Attack    58629
Name: Class, dtype: int64


# Create the classifier (LightGBM with DART boosting and a fixed seed)
clf = LGBMClassifier(boosting_type='dart', random_state=0)

# Train the classifier on the resampled training data
clf.fit(df_smt, y_smt)

[LightGBM] [Info] Number of positive: 58726, number of negative: 58629
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2063
[LightGBM] [Info] Number of data points in the train set: 117355, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500413 -> initscore=0.001653
[LightGBM] [Info] Start training from score 0.001653

LGBMClassifier(boosting_type='dart', random_state=0)


# Preprocess the test data with the function defined earlier
test = data_preprocessing(test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16128/16128 [00:01<00:00, 8183.39it/s]


# Predict with the trained model
pred = clf.predict(test)


# Count the predictions per label
pd.DataFrame(pred).value_counts()

Normal    14905
Attack     1223
dtype: int64


# Evaluate the classification result against the true test labels
print(classification_report(test_y, pred))

              precision    recall  f1-score   support

      Attack       0.95      0.83      0.88      1403
      Normal       0.98      1.00      0.99     14725

    accuracy                           0.98     16128
   macro avg       0.97      0.91      0.94     16128
weighted avg       0.98      0.98      0.98     16128

	Arbitration_ID	DLC	Data_1	Data_2	Data_3	Data_4	Data_5	Data_6	Data_7	Data_8
0	31	5	0	136	139.0	0.0	193.0	-1.0	-1.0	-1.0
1	8	8	244	3	39.0	23.0	0.0	247.0	7.0	128.0
2	34	8	4	245	255.0	255.0	0.0	123.0	0.0	0.0
3	23	8	0	64	8.0	0.0	255.0	211.0	157.0	252.0
4	9	8	5	52	2.0	48.0	0.0	188.0	89.0	40.0

	Arbitration_ID	DLC	Data_1	Data_2	Data_3	Data_4	Data_5	Data_6	Data_7	Data_8
0	31	5	0	136	139.0	0.0	193.0	-1.0	-1.0	-1.0
1	8	8	244	3	39.0	23.0	0.0	247.0	7.0	128.0
2	34	8	4	245	255.0	255.0	0.0	123.0	0.0	0.0
3	23	8	0	64	8.0	0.0	255.0	211.0	157.0	252.0
4	9	8	5	52	2.0	48.0	0.0	188.0	89.0	40.0

	Arbitration_ID	DLC	Data_1	Data_2	Data_3	Data_4	Data_5	Data_6	Data_7	Data_8
0	31	5	0	136	139.0	0.0	193.0	-1.0	-1.0	-1.0
1	8	8	244	3	39.0	23.0	0.0	247.0	7.0	128.0
2	34	8	4	245	255.0	255.0	0.0	123.0	0.0	0.0
3	23	8	0	64	8.0	0.0	255.0	211.0	157.0	252.0
4	9	8	5	52	2.0	48.0	0.0	188.0	89.0	40.0

티스토리

LightGBM을 활용한 이상탐지(Anomaly detection)

LightGBM을 활용한 이상탐지(Anomaly detection)

LightGBM을 활용한 지도학습 기반의 이상 데이터 탐지 모델 개발

1. 분석 배경 및 목적 설명

1) 배경

2) 이상 탐지

3) 데이터셋 구성

2. 데이터 수집 및 전처리

1) 패키지 로드 및 데이터 확인

2) Train/Test Split

3) 데이터 전처리 및 파생변수 생성

4) 불균형 데이터의 샘플링

Tomek's link

SMOTE(Synthetic Minority Over-sampling Technique)

ADASYN(Adaptive Synthetic Sampling Approach)

SMOTE-Tomek

3. 모델링

4. 모델 평가

참고문헌

	Unnamed: 0	Timestamp	Arbitration_ID	DLC	Data	Class	SubClass
0	9	1.597760e+09	453	5	00 88 8B 00 C1	Normal	Normal
1	17	1.597760e+09	251	8	F4 03 27 17 00 F7 07 80	Normal	Normal
2	21	1.597760e+09	47F	8	04 F5 FF FF 00 7B 00 00	Normal	Normal
3	36	1.597760e+09	394	8	00 40 08 00 FF D3 9D FC	Normal	Normal
4	42	1.597760e+09	260	8	05 34 02 30 00 BC 59 28	Normal	Normal

	Arbitration_ID	DLC	Data_1	Data_2	Data_3	Data_4	Data_5	Data_6	Data_7	Data_8
0	453	5	00	88	8B	00	C1	NaN	NaN	NaN
1	251	8	F4	03	27	17	00	F7	07	80
2	47F	8	04	F5	FF	FF	00	7B	00	00
3	394	8	00	40	08	00	FF	D3	9D	FC
4	260	8	05	34	02	30	00	BC	59	28

	Arbitration_ID	DLC	Data_1	Data_2	Data_3	Data_4	Data_5	Data_6	Data_7	Data_8
0	31	5	00	88	8B	00	C1	NaN	NaN	NaN
1	8	8	F4	03	27	17	00	F7	07	80
2	34	8	04	F5	FF	FF	00	7B	00	00
3	23	8	00	40	08	00	FF	D3	9D	FC
4	9	8	05	34	02	30	00	BC	59	28

	Arbitration_ID	DLC	Data_1	Data_2	Data_3	Data_4	Data_5	Data_6	Data_7	Data_8
0	453	5	00	88	8B	00	C1	NaN	NaN	NaN
1	251	8	F4	03	27	17	00	F7	07	80
2	47F	8	04	F5	FF	FF	00	7B	00	00
3	394	8	00	40	08	00	FF	D3	9D	FC
4	260	8	05	34	02	30	00	BC	59	28

	Arbitration_ID	DLC	Data_1	Data_2	Data_3	Data_4	Data_5	Data_6	Data_7	Data_8
0	31	5	00	88	8B	00	C1	NaN	NaN	NaN
1	8	8	F4	03	27	17	00	F7	07	80
2	34	8	04	F5	FF	FF	00	7B	00	00
3	23	8	00	40	08	00	FF	D3	9D	FC
4	9	8	05	34	02	30	00	BC	59	28

	Arbitration_ID	DLC	Data_1	Data_2	Data_3	Data_4	Data_5	Data_6	Data_7	Data_8
0	453	5	00	88	8B	00	C1	NaN	NaN	NaN
1	251	8	F4	03	27	17	00	F7	07	80
2	47F	8	04	F5	FF	FF	00	7B	00	00
3	394	8	00	40	08	00	FF	D3	9D	FC
4	260	8	05	34	02	30	00	BC	59	28

	Arbitration_ID	DLC	Data_1	Data_2	Data_3	Data_4	Data_5	Data_6	Data_7	Data_8
0	31	5	00	88	8B	00	C1	NaN	NaN	NaN
1	8	8	F4	03	27	17	00	F7	07	80
2	34	8	04	F5	FF	FF	00	7B	00	00
3	23	8	00	40	08	00	FF	D3	9D	FC
4	9	8	05	34	02	30	00	BC	59	28