0. DeepFM paper walkthrough
https://huidea.tistory.com/279
This is the code implementing the DeepFM model from the ad-recommendation CTR papers reviewed earlier.
In fact, CTR models like this are very easy to build with the deepctr-torch API (there is also a TensorFlow-based DeepCTR).
1. Packages & data
Package: DeepCTR-Torch
https://github.com/shenweichen/DeepCTR-Torch
https://deepctr-torch.readthedocs.io/en/v0.2.4/
Data: Avazu dataset
https://www.kaggle.com/c/avazu-ctr-prediction
- The full dataset is far too large, so only 100k rows were sampled with Google BigQuery (which meant writing SQL for the first time in ages).
2. Code
- "Reverse Engineering - for studying" 아래 부분은 deepctr 의 deepfm 코드를 직접 뜯어본거다.
- 공부용으로 적어둔 코드니 참고만 하시고 코드 실행은 굳이 할 필요없다. 학습과정은 그 위에서 다 끝난다.
- 같은 코드 올려둔 깃헙 주소 : https://github.com/SeohuiPark/MLDLstudy/blob/main/Recommendation/deepfm_avazudata_10.ipynb
3. Interpreting the results + takeaways
- Performance is not that great.
- It looked like overfitting, so the batch size and dropout rate were both increased, but the test AUC would not climb above 0.72.
- The number of numeric features that are left unembedded seems to affect how much training improves:
compared with an earlier run where date was kept as a categorical value (label encoded and then embedded),
keeping date as a numeric value and scaling it, as the code below does, performs better (a small comparison sketch follows this list).
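To make that comparison concrete, here is a minimal toy sketch (hypothetical values, not part of the notebook) contrasting the two treatments of the Avazu hour column (format YYMMDDHH):

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

toy = pd.DataFrame({'hour': [14102100, 14102101, 14102923]})   # YYMMDDHH timestamps (toy values)

# (a) numeric treatment used in the code below: recency survives as a larger scaled value
toy['hour_dense'] = MinMaxScaler().fit_transform(toy[['hour']]).ravel()

# (b) categorical treatment tried earlier: each timestamp is just an id for an embedding lookup
toy['hour_sparse'] = LabelEncoder().fit_transform(toy['hour'])
print(toy)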
- The full dataset has 40,428,967 rows (~40M) and cannot be loaded in Colab.
- Only 100k rows were sampled and then loaded (a pandas-only alternative to the BigQuery sampling is sketched below).
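For reference, a rough pandas-only alternative to the BigQuery sampling could look like this sketch (hypothetical; the file name follows the Kaggle train.gz, and the author's actual sampling was done with SQL in BigQuery):

import pandas as pd

frac = 100_000 / 40_428_967   # keep roughly 100k of the ~40M rows
reader = pd.read_csv("train.gz", compression="gzip", chunksize=1_000_000)   # Kaggle Avazu train file
sample = pd.concat(chunk.sample(frac=frac, random_state=2022) for chunk in reader)
sample.to_csv("avazu_sample_10.csv", index=False)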
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:150% !important; }</style>"))
! pip install deepctr-torch
import os
import gzip
import shutil
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
def data_load():
    print("\n\n1. data load ")
    data_path = "/content/drive/MyDrive/Colab Notebooks/2022_recom_study/ctr_sample_dataset/abazu_dataset/"
    data = pd.read_csv(data_path + "avazu_sample_10.csv")
    display(data.head(3))
    print(data.columns)
    print(data.shape)
    return data
def feature_selection(data):
    print("\n\n2. feature selection ")
    sparse_features = data.columns.tolist()
    sparse_features.remove('click')   # target column
    sparse_features.remove('hour')    # treated as a dense (numeric) feature instead
    dense_features = ['hour']
    print("sparse feature :", sparse_features)
    print("dense feature :", dense_features)
    print("target :", 'click')
    return data, sparse_features, dense_features
def feature_encoding(data, sparse_features, dense_features):
    print("\n\n3-1. feature encoding ")
    print("categorical value to numeric label")
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    print("numeric value Minmax scaling ")
    mms = MinMaxScaler(feature_range=(0, 1))   ### more recent dates map to larger values
    data[dense_features] = mms.fit_transform(data[dense_features])
    return data
def feature_format_deepfm(data, sparse_features, dense_features, embedding_dim):
    print(f"\n\n3-2. feature embedding - embedding size {embedding_dim}")
    spar_feat_list = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=embedding_dim)
                      for feat in sparse_features]
    dense_feat_list = [DenseFeat(feat, 1, ) for feat in dense_features]
    fixlen_feature_columns = spar_feat_list + dense_feat_list
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    return dnn_feature_columns, linear_feature_columns, feature_names
def data_split(data, test_ratio, feature_names, random_seed):
    print(f"\n\n4. data split (test ratio - {test_ratio})")
    train, test = train_test_split(data, test_size=test_ratio, random_state=random_seed)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    return train, test, train_model_input, test_model_input
def modeling(linear_feature_columns, dnn_feature_columns,
             batch_size, num_epoch, val_ratio, test_ratio, l2_decay_val, random_seed):
    print(f"\n\n5. Modeling")
    model = DeepFM(linear_feature_columns=linear_feature_columns,
                   dnn_feature_columns=dnn_feature_columns,
                   l2_reg_linear=l2_decay_val, l2_reg_embedding=l2_decay_val, l2_reg_dnn=l2_decay_val,
                   dnn_dropout=0.5,
                   dnn_use_bn=True,
                   dnn_hidden_units=(32, 16),
                   task='binary',
                   seed=random_seed, device=device)   # device is set in the __main__ block below
    model.compile("adam", "binary_crossentropy",
                  metrics=["binary_crossentropy", "auc"], )
    return model
def eval_test(model, test_model_input, batch_size):
    print(f"\n\n6. Evaluation testset")
    pred_ans = model.predict(test_model_input, batch_size)   # batch_size default: 256
    print("")
    # test and target are defined in the __main__ block below
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
4. Modeling
if __name__ == "__main__":
    batch_size = 1000
    num_epoch = 20
    val_ratio = 0.1
    test_ratio = 0.1
    random_seed = 2022
    l2_decay_val = 1e-01
    embedding_dim = 5

    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    data = data_load()
    target = ['click']
    data, sparse_features, dense_features = feature_selection(data)
    data = feature_encoding(data, sparse_features, dense_features)
    dnn_feature_columns, linear_feature_columns, feature_names = feature_format_deepfm(data, sparse_features, dense_features, embedding_dim)
    train, test, train_model_input, test_model_input = data_split(data, test_ratio,
                                                                  feature_names, random_seed)

    model = modeling(linear_feature_columns, dnn_feature_columns,
                     batch_size, num_epoch, val_ratio, test_ratio, l2_decay_val, random_seed)
    model.fit(train_model_input, train[target].values,
              batch_size=batch_size, epochs=num_epoch, verbose=2, validation_split=val_ratio)

    eval_test(model, test_model_input, batch_size)
Reverse Engineering - for studying
import torch.nn as nn   # needed for nn.Module; not imported in the cells above

class FM(nn.Module):
    """Factorization Machine models pairwise (order-2) feature interactions
    without linear term and bias.

    Input shape
        - 3D tensor with shape: ``(batch_size, field_size, embedding_size)``.
    Output shape
        - 2D tensor with shape: ``(batch_size, 1)``.
    References
        - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
    """

    def __init__(self):
        super(FM, self).__init__()

    def forward(self, inputs):
        fm_input = inputs                                                        # (batch_size, field_size, embedding_size)
        square_of_sum = torch.pow(torch.sum(fm_input, dim=1, keepdim=True), 2)   # (sum over fields)^2
        sum_of_square = torch.sum(fm_input * fm_input, dim=1, keepdim=True)      # sum over fields of the squares
        cross_term = square_of_sum - sum_of_square
        cross_term = 0.5 * torch.sum(cross_term, dim=2, keepdim=False)           # sum over the embedding dimension
        return cross_term
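For reference, the forward pass above is the linear-time reformulation of the FM pairwise term from the Rendle (2010) paper linked in the docstring. With one embedding vector v_i per field (and x_i = 1 for the fields that are present), the sum of all pairwise interactions equals

$$\tfrac{1}{2}\sum_{f=1}^{k}\Big[\Big(\sum_{i=1}^{n} v_{i,f}\Big)^{2}-\sum_{i=1}^{n} v_{i,f}^{2}\Big]$$

which is exactly 0.5 * (square_of_sum - sum_of_square) summed over the embedding dimension (dim=2), so the O(n^2) pairwise products never have to be formed explicitly.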
import torch
import torch.nn as nn
from deepctr_torch.models.basemodel import BaseModel
from deepctr_torch.inputs import combined_dnn_input
from deepctr_torch.layers import FM, DNN
class DeepFM(BaseModel):
    """Instantiates the DeepFM Network architecture.

    :param linear_feature_columns: An iterable containing all the features used by the linear part of the model. (-> features fed to the FM/linear part; here, all features)
    :param dnn_feature_columns: An iterable containing all the features used by the deep part of the model. (-> features fed to the DNN part; here, all features)
    :param use_fm: bool, use the FM part or not
    :param dnn_hidden_units: list of positive integers or empty list,
        the layer number and units in each layer of the DNN (-> default (256, 128))
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate (-> DNN dropout)
    :param dnn_activation: activation function to use in the DNN
    :param dnn_use_bn: bool, whether to use BatchNormalization before activation in the DNN
    :param l2_reg_linear: float, L2 regularizer strength applied to the linear part (default 1e-5)
    :param l2_reg_embedding: float, L2 regularizer strength applied to the embedding vectors (default 1e-5)
    :param l2_reg_dnn: float, L2 regularizer strength applied to the DNN (default 0)
    :param init_std: float, standard deviation used to initialize the embedding vectors
    :param seed: integer, random seed
    :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
    :param device: str, ``"cpu"`` or ``"cuda:0"``
    :param gpus: list of int or torch.device for multiple gpus. If None, run on `device`. `gpus[0]` should be the same gpu as `device`.
    :return: A PyTorch model instance.
    """

    def __init__(self,
                 linear_feature_columns, dnn_feature_columns, use_fm=True,
                 dnn_hidden_units=(256, 128),
                 l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024,
                 dnn_dropout=0,
                 dnn_activation='relu', dnn_use_bn=False, task='binary', device='cpu', gpus=None):
        super(DeepFM, self).__init__(linear_feature_columns, dnn_feature_columns, l2_reg_linear=l2_reg_linear,
                                     l2_reg_embedding=l2_reg_embedding, init_std=init_std, seed=seed, task=task,
                                     device=device, gpus=gpus)
        self.use_fm = use_fm
        self.use_dnn = len(dnn_feature_columns) > 0 and len(dnn_hidden_units) > 0
        if use_fm:          ### instantiate the FM module
            self.fm = FM()
        if self.use_dnn:    ### declare the modules used in the DNN part
            self.dnn = DNN(self.compute_input_dim(dnn_feature_columns), dnn_hidden_units,
                           activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, use_bn=dnn_use_bn,
                           init_std=init_std, device=device)
            self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False).to(device)
            self.add_regularization_weight(
                filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2=l2_reg_dnn)
            self.add_regularization_weight(self.dnn_linear.weight, l2=l2_reg_dnn)
        self.to(device)

    def forward(self, X):   ### forward pass
        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
                                                                                  self.embedding_dict)
        ## 1) FM part
        ### 1.1) linear (order-1) term
        logit = self.linear_model(X)            ## add to the logit
        ### 1.2) FM order-2 term (pairwise interactions between fields)
        if self.use_fm and len(sparse_embedding_list) > 0:
            fm_input = torch.cat(sparse_embedding_list, dim=1)
            logit += self.fm(fm_input)          ## add to the logit
        ## 2) DNN part
        if self.use_dnn:
            dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)
            dnn_output = self.dnn(dnn_input)
            dnn_logit = self.dnn_linear(dnn_output)
            logit += dnn_logit                  ## add to the logit
        y_pred = self.out(logit)
        return y_pred
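Putting the pieces together, forward computes the prediction from the DeepFM paper,

$$\hat{y} = \mathrm{sigmoid}\big(y_{\text{linear}} + y_{\text{FM}} + y_{\text{DNN}}\big)$$

where self.out is the prediction layer that applies the sigmoid because task='binary'.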