01. goodbooks-10k Exploratory Data Analysis
Before building a baseline, we need to explore the data. When first building a recommender system, it is important to decide on the target user group and to understand the characteristics of the items and users in order to decide which items to recommend. Exploratory data analysis is exactly the tool for this.
# Load packages
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm

# Path setting - as in the course material
path = '../input/t-academy-recommendation2/books/'

# Path setting - mounting Google Drive (when running on Colab)
from google.colab import drive
drive.mount('/content/drive')
The dataset consists of the following files:
- books.csv : book metadata
- book_tags.csv : book-tag mapping information
- ratings.csv : the ratings users gave to books
- tags.csv : tag information
- to_read.csv : books users have marked as to-read
# Load the data
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
ratings = pd.read_csv(path + "ratings.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")

# Book metadata
books.head()
books.columns
books['small_image_url'].values[0]
For books, the cover matters as well as the content, so content-based recommendation is also possible: extract image features from each book's cover and find similar books with a model such as a CNN.
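The notebook doesn't implement this, but a minimal sketch of the idea, assuming torchvision and scikit-learn are available and reusing the small_image_url column referenced above, could look like this:

import numpy as np
import requests
import torch
import torchvision.models as models
import torchvision.transforms as T
from io import BytesIO
from PIL import Image
from sklearn.neighbors import NearestNeighbors

# Pretrained CNN as a fixed feature extractor: drop the classifier head
cnn = models.resnet18(pretrained=True)
cnn.fc = torch.nn.Identity()
cnn.eval()

preprocess = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def cover_embedding(url):
    # Download a cover image and map it to a 512-dim CNN feature vector
    img = Image.open(BytesIO(requests.get(url).content)).convert('RGB')
    with torch.no_grad():
        return cnn(preprocess(img).unsqueeze(0)).squeeze(0).numpy()

# Embed a sample of covers and index them for cosine nearest-neighbour search
emb = np.stack([cover_embedding(u) for u in books['small_image_url'].head(100)])
knn = NearestNeighbors(metric='cosine').fit(emb)
dist, idx = knn.kneighbors(emb[0:1], n_neighbors=6)  # the book itself + 5 similar covers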
# Keep only the columns we need
books = books[['book_id', 'authors', 'title', 'ratings_count', 'average_rating', 'language_code']].reset_index(drop=True)

# plotnine: R's ggplot ported to Python, so ggplot-style code works almost as-is.
# Because ggplot's data structure is explicit, we pass in the data to use and
# can express a column as a histogram.

# Count how many books each author wrote (1 to 60 books per author)
agg = books.groupby('authors')['authors'].agg({'count'})

(ggplot(data=agg)
 + geom_histogram(aes(x='count'), binwidth=1, fill='#49beb7')
 + labs(title="Number of the Author's Book", x='Book Count', y='Author Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)

# Summary of the book metadata
print("Number of books:", books['book_id'].nunique())
print("Number of authors:", books['authors'].nunique(), "\n")
print(pd.DataFrame(agg['count'].describe()).T)

# Most prolific authors
agg.sort_values(by='count', ascending=False)

# Average rating of the books
(ggplot(data=books)
 + geom_histogram(aes(x='average_rating'), binwidth=0.1, fill='#49beb7')
 + labs(title="Average Rating of the Books", x='Average Rating', y='Book Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)
Looking at the average ratings, 14 books have a rating of 3 or below. Even if their similarity scores are high, it can be better for training not to recommend such books. So we check whether there is any reason to recommend them, and usually end up excluding them. The code is below.
# Books with little reason to be recommended (average rating of 3 or below)
books[books['average_rating'] <= 3].shape[0]

# The highest rated books
books.sort_values(by='average_rating', ascending=False).head()

# Distribution of how often books are rated
(ggplot(data=books)
 + geom_histogram(aes(x='ratings_count'), binwidth=10000, fill='#49beb7')
 + labs(title="Rating Count of the Books", x='Rating Count', y='Book Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)
It is sometimes good to recommend highly rated books first, but a high rating may come from a book that few people have actually read, i.e., one with a small sample size. So always recommending only the highest rated books is not the right approach.
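One common middle ground, not part of the original notebook but a useful sketch here, is an IMDB-style weighted rating that shrinks the average of rarely rated books toward the global mean (the 0.60 quantile threshold m is an arbitrary choice):

# Weighted rating: books with few ratings are pulled toward the global mean C
C = books['average_rating'].mean()          # global mean rating
m = books['ratings_count'].quantile(0.60)   # minimum-votes threshold (assumed)
v = books['ratings_count']
R = books['average_rating']
books['weighted_rating'] = (v / (v + m)) * R + (m / (v + m)) * C
books.sort_values(by='weighted_rating', ascending=False).head()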
pd.DataFrame(books['ratings_count'].describe()).T

# Compare ratings_count with average_rating during exploration
books.sort_values(by='ratings_count', ascending=False).head()

# Language codes of the books
agg = pd.DataFrame(books['language_code'].value_counts()).reset_index()
agg.columns = ['language_code', 'count']

(ggplot(data=agg)
 + geom_bar(aes(x='language_code', y='count'), fill='#49beb7', stat='identity')
 + labs(title="Language Code of the Books", x='Language Code', y='Book Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)
In the metadata, missing values and inaccurate entries show up from time to time.
We also need to recommend books written in a language that matches the user's locale.
books['language_code'].unique()
books.isnull().sum()

# The ratings users gave to books
ratings

# Count of book_ids that appear in ratings but not in books
## i.e., the length of the set difference between the two id sets
len(set(ratings['book_id'].unique()).difference(set(books['book_id'].unique())))
Of the 10,000 books, 9,188 turned out to have no metadata. In other words, although users actually rated them, only 812 of the rated books appear in the metadata. Taken together, this suggests that a content-based recommender would be inaccurate and perform poorly here, so it is more advantageous to use collaborative filtering or a statistics-based model.
Checking the books' tag information
The relevant files:
- book_tags : the tag_ids mapped to each book_id
- tags : the mapping between tag_id and tag_name
# Look at the tag information
book_tags.head()

# Table mapping each tag_id to its tag_name
tags.head()

# Join the tag names onto book_tags via tag_id
book_tags = pd.merge(tags, book_tags, how='left', on='tag_id')
book_tags

agg = book_tags.groupby(['tag_name'])['count'].agg({'sum'}).reset_index()
agg = agg.sort_values(by='sum', ascending=False).reset_index(drop=True)
agg.head()

# Plot the top 20 tags
(ggplot(data=agg.loc[0:19])
 + geom_bar(aes(x='tag_name', y='sum'), fill='#49beb7', stat="identity")
 + labs(title="Top 20: Tag Count", x='Tag', y='Tag Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black', rotation=60),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)
Here the to-read tag appears far more often than any other tag. Just as TF-IDF downweights or removes words that occur too frequently, one option is to remove the to-read tag and compare only the remaining tags.
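As a sketch of that option (which shelves count as noise is a judgment call; the names below are only examples of bookkeeping shelves in this dataset):

# Drop bookkeeping shelves that dominate the counts, analogous to removing stopwords in TF-IDF
noise_tags = ['to-read', 'currently-reading', 'owned', 'books-i-own']
agg_filtered = agg[~agg['tag_name'].isin(noise_tags)].reset_index(drop=True)
agg_filtered.head(20)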
# Descriptive statistics of how many times each tag was used
pd.DataFrame(agg['sum'].describe()).T
The spread across books is very large: the minimum is tiny while the maximum is huge. Using this tag information, we can recommend based on a user's affinity for the tags they browse, and also recommend books whose tags are similar.
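The tag-similarity idea isn't implemented in the notebook; a minimal sketch using TF-IDF over each book's tag names (column names follow the merged book_tags frame above) might be:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One "document" per book: the concatenation of its tag names
docs = (book_tags.groupby('goodreads_book_id')['tag_name']
        .apply(lambda t: ' '.join(t.astype(str))).reset_index())

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(docs['tag_name'])

# The 5 books whose tag profiles are most similar to the first book
sim = cosine_similarity(X[0], X).ravel()
top = sim.argsort()[::-1][1:6]
docs.loc[top, 'goodreads_book_id']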
Checking the ratings information
- Descriptive statistics over all books and users
# Check the ratings information
# Distribution of how many books each user has rated
agg = ratings.groupby(['user_id'])['book_id'].agg({'count'}).reset_index()

(ggplot(data=agg)
 + geom_histogram(aes(x='count'), binwidth=5, fill='#49beb7')
 + labs(title='Average Number of the Read Count', x='Read Count', y='User Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color="black"),
     axis_ticks=element_line(color="grey"),
     figure_size=(10, 6))
)
pd.DataFrame(agg['count'].describe()).T
- On average, a user reads 18 books
- Every user bought and read at least 2 books
- The heaviest reader bought 200 books
# Distribution of how many users have read each book
agg = ratings.groupby(['book_id'])['book_id'].agg({'count'}).reset_index()

(ggplot(data=agg)
 + geom_histogram(aes(x='count', y='stat(count)'), fill='#49beb7', binwidth=5)
 + theme_minimal()
 + ggtitle("Average Read Count per Book")
 + labs(x="Read Count", y="Book Count")
 + theme(
     axis_text_x=element_text(angle=60, color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color="black"),
     axis_ticks=element_line(color="grey"),
     figure_size=(8, 4))
)
pd.DataFrame(agg['count'].describe()).T
- Each book was read by at least 8 users, and the widely read ones by 100 users
- The spread is large, but most books with ratings turn out to be ones read by about 100 users each
agg.head()

# Look at the first Harry Potter series entries (book_ids 1-8)
books[books['book_id'].isin([1, 2, 3, 4, 5, 6, 7, 8])].head()
ratings['user_id'].unique()
ratings[(ratings['user_id'] == 314) & (ratings['book_id'].isin([1, 2, 3, 4, 5, 6, 7, 8]))]
# How many of the 8 series books each user has read
agg = ratings[ratings['book_id'].isin([1, 2, 3, 4, 5, 6, 7, 8])].groupby(['user_id'])['book_id'].agg({'nunique'})
agg = agg.reset_index()
agg = agg.groupby(['nunique'])['user_id'].agg({'count'}).reset_index()

(ggplot(data=agg)
 + geom_bar(aes(x='nunique', y='count'), fill='#49beb7', stat="identity")
 + labs(title="Harry Potter's Reading Count", x='Series Count', y='Reading Person Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color="black"),
     axis_ticks=element_line(color="grey"),
     figure_size=(10, 6))
)
agg['ratio'] = agg['count'] / agg['count'].sum()
agg[['nunique', 'ratio']].T
- Series such as Harry Potter tend to be read together
to_read information
to_read.head()
to_read['user_id'].nunique()
- Not only what users have already read but also the books they plan to read can be combined into the recommendation, as sketched below
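For example, a small sketch that turns the to-read shelf into a per-user candidate list, which could then be ranked ahead of other candidates:

# Each user's to-read shelf as a high-priority candidate list
cart = to_read.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
cart.columns = ['user_id', 'to_read_books']
cart.head()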
Creating the train and validation sets
# Bundle each user's read book_ids
agg = ratings.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
agg.head()
# Randomly take half of each user's books for training (sampling without replacement)
agg['train'] = agg['unique'].apply(lambda x: np.random.choice(x, len(x) // 2, replace=False))
agg.head()
# The remaining books form the test set
test = []
for i in tqdm(range(0, agg.shape[0])):
    test_rec = list(set(agg.loc[i, 'unique']).difference(set(agg.loc[i, 'train'])))
    test.append(test_rec)

agg['test'] = test
# train dataset
train_map = agg[['user_id', 'train']]

# unnest the per-user lists into (user_id, book_id) rows
train_map_unnest = np.dstack(
    (
        np.repeat(train_map.user_id.values, list(map(len, train_map.train))),
        np.concatenate(train_map.train.values)
    )
)
train_map_unnest = pd.DataFrame(data=train_map_unnest[0], columns=train_map.columns)
train_map_unnest.head()
# test dataset
test_map = agg[['user_id', 'test']]

# unnest the per-user lists into (user_id, book_id) rows
test_map_unnest = np.dstack(
    (
        np.repeat(test_map.user_id.values, list(map(len, test_map.test))),
        np.concatenate(test_map.test.values)
    )
)
test_map_unnest = pd.DataFrame(data=test_map_unnest[0], columns=test_map.columns)
test_map_unnest.head()
train_map_unnest.columns = ['user_id', 'book_id']
test_map_unnest.columns = ['user_id', 'book_id']
train_map_unnest.to_csv("train.csv", index=False)
test_map_unnest.to_csv("test.csv", index=False)
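Because the test set is the set difference of each user's books against the sampled train half, the two splits should be disjoint per user; a quick sanity check over the frames just written:

# Sanity check: no (user_id, book_id) pair may appear in both splits
overlap = train_map_unnest.merge(test_map_unnest, on=['user_id', 'book_id'], how='inner')
assert overlap.shape[0] == 0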
- 53,424 users bought books in total, over a catalogue of 10,000 books
- 48,871 of them added books to their to-read shelf
- On average, each user has more than 2 books
02. goodbooks-10k Baseline Model
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm
path = '../input/t-academy-recommendation2/books/'
- books.csv : book metadata
- book_tags.csv : book-tag mapping information
- ratings.csv : the ratings users gave to books
- tags.csv : tag information
- to_read.csv : books users have marked as to-read (their cart)
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)
# Ground truth: the unique book_ids each user holds in the test split
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()):
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])
The setting: recommending 100 books per user.
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()
Baseline
- A statistics-based model
books.sort_values(by='books_count', ascending=False)[0:5]
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    for rec in popular_rec_model[0:200]:
        rec_list.append(rec)

    total_rec_list[user] = rec_list
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py
class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt
        self.topn = topn

    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)

            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q

    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n

    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent

    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()
Extending the baseline
- Among all books, recommend the highly rated ones first
- Recommend books by the user's favorite authors first
- Recommend books on the to-read shelf, and books by those authors, first
- If a read book belongs to a series, recommend the rest of the series
- Recommend the newest books
train = pd.merge(train, books[['book_id', 'authors', 'ratings_count']], how='left', on='book_id')
agg = train.groupby(['user_id', 'authors'])['authors'].agg({'count'}).reset_index()
agg = agg.sort_values(by='count', ascending=False)
agg.head()
author_books = books[['book_id', 'authors', 'ratings_count']].sort_values(by=['authors', 'ratings_count'], ascending=[True, False])
author_books = author_books.reset_index(drop=True)
author_books.head()
author_rec_model = agg.merge(author_books, how='left', on=['authors'])
author_rec_model.head()
author_rec_model[author_rec_model['user_id'] == 30944]['book_id'].values
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    author_rec_model_ = author_rec_model[author_rec_model['user_id'] == user]['book_id'].values
    for rec in author_rec_model_:
        rec_list.append(rec)

    # Pad with the popularity model if fewer than 200 recommendations remain
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            rec_list.append(i)

    total_rec_list[user] = rec_list[0:200]
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()
Post-processing
- Books the user has already read must not be recommended
- Books should match a language the user reads
# Collect the list of books each user has already read
read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list.head()
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    author_rec_model_ = author_rec_model[author_rec_model['user_id'] == user]['book_id'].values
    seen = read_list[read_list['user_id'] == user]['unique'].values[0]
    for rec in author_rec_model_:
        if rec not in seen:
            rec_list.append(rec)

    # Pad with the popularity model, again skipping books already read
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            if i not in seen:
                rec_list.append(i)

    total_rec_list[user] = rec_list[0:200]
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()
# Collect the languages each user can read
## With user metadata this would be easy to look up; since there is none, we build it ourselves:
## gather the languages of the books each user has read and treat those as the user's languages
language = pd.merge(train, books[['book_id', 'language_code']], how='left', on='book_id')
language_list = language.groupby(['user_id'])['language_code'].agg({'unique'}).reset_index()
language_list.head()
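The section stops after collecting each user's languages; a minimal sketch of the remaining filtering step (the helper language_filter is hypothetical, not part of the original notebook) could be:

# Hypothetical post-processing helper: keep only candidates whose language
# the user has read before; books with an unknown language_code are kept
user_langs = dict(zip(language_list['user_id'], language_list['unique']))
book_lang = dict(zip(books['book_id'], books['language_code']))

def language_filter(user, rec_list):
    langs = set(user_langs.get(user, []))
    return [b for b in rec_list
            if pd.isnull(book_lang.get(b)) or book_lang.get(b) in langs]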
03. goodbooks-10k Collaborative Filtering Model
Here we write the code that makes recommendations to users with a collaborative filtering model.
# Load the required libraries
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm
# Set the data path
path = '../input/t-academy-recommendation2/books/'
# Read the csv files from the path
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")
# Cast book_id to string
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)
The table has 23 columns in total, but the trailing columns were trimmed for readability. For the recommendation we extract the book_ids sorted by books_count.
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]
To make suitable recommendations, we group all the book_ids each user has read, keyed by user_id.
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()):
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()
From here on, the code concerns Matrix Factorization. It consists of the functions fit, cost, gradient, gradient_descent, get_prediction, and get_complete_matrix.
fit: loops over the epochs, computing the cost and running gradient descent at each step
cost: computes the RMSE between the observed ratings and the current predictions
gradient: computes the gradients of the latent features
gradient_descent: updates the matrices using the gradients and the learning rate
get_prediction: computes the predicted rating from the factorized matrices
get_complete_matrix: reconstructs the full rating matrix, including all bias terms
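Concretely, get_prediction combines a global bias, per-user and per-item biases, and the latent factors:

$$\hat{r}_{ij} = b + b^{(P)}_i + b^{(Q)}_j + P_i Q_j^{\top}$$

and gradient_descent runs SGD on the regularized squared error $(r_{ij} - \hat{r}_{ij})^2 + \lambda\,(\lVert P_i \rVert^2 + \lVert Q_j \rVert^2)$, which is exactly what the gradient method differentiates.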
import numpy as np
from tqdm import tqdm_notebook as tqdm

# Base code : https://yamalab.tistory.com/92
class MatrixFactorization():
    def __init__(self, R, k, learning_rate, reg_param, epochs, verbose=False):
        """
        :param R: rating matrix
        :param k: latent parameter
        :param learning_rate: alpha on weight update
        :param reg_param: beta on weight update
        :param epochs: training epochs
        :param verbose: print status
        """
        self._R = R
        self._num_users, self._num_items = R.shape
        self._k = k
        self._learning_rate = learning_rate
        self._reg_param = reg_param
        self._epochs = epochs
        self._verbose = verbose

    def fit(self):
        """
        training Matrix Factorization: update latent weights and biases

        Note on self._b:
        - global bias: the mean of the observed (nonzero) ratings in R
        - acts as normalization, so negative values end up in the latent
          features rather than in the final rating
        :return: training_process
        """
        # init latent features
        self._P = np.random.normal(size=(self._num_users, self._k))
        self._Q = np.random.normal(size=(self._num_items, self._k))

        # init biases
        self._b_P = np.zeros(self._num_users)
        self._b_Q = np.zeros(self._num_items)
        self._b = np.mean(self._R[np.where(self._R != 0)])

        # train for the given number of epochs
        self._training_process = []
        for epoch in range(self._epochs):
            # train only on the indices where a rating exists
            xi, yi = self._R.nonzero()
            for i, j in zip(xi, yi):
                self.gradient_descent(i, j, self._R[i, j])
            cost = self.cost()
            self._training_process.append((epoch, cost))

            # print status
            if self._verbose == True and ((epoch + 1) % 10 == 0):
                print("Iteration: %d ; cost = %.4f" % (epoch + 1, cost))

    def cost(self):
        """
        compute root mean square error over the observed entries
        :return: rmse cost
        """
        # xi, yi: indices where R[xi, yi] is nonzero
        # reference: http://codepractice.tistory.com/90
        xi, yi = self._R.nonzero()
        cost = 0
        for x, y in zip(xi, yi):
            cost += pow(self._R[x, y] - self.get_prediction(x, y), 2)
        return np.sqrt(cost / len(xi))

    def gradient(self, error, i, j):
        """
        gradient of the latent features for GD
        :param error: rating - prediction error
        :param i: user index
        :param j: item index
        :return: gradient of latent feature tuple
        """
        dp = (error * self._Q[j, :]) - (self._reg_param * self._P[i, :])
        dq = (error * self._P[i, :]) - (self._reg_param * self._Q[j, :])
        return dp, dq

    def gradient_descent(self, i, j, rating):
        """
        gradient descent step for one observed rating
        :param i: user index of matrix
        :param j: item index of matrix
        :param rating: rating of (i, j)
        """
        # get error
        prediction = self.get_prediction(i, j)
        error = rating - prediction

        # update biases
        self._b_P[i] += self._learning_rate * (error - self._reg_param * self._b_P[i])
        self._b_Q[j] += self._learning_rate * (error - self._reg_param * self._b_Q[j])

        # update latent features
        dp, dq = self.gradient(error, i, j)
        self._P[i, :] += self._learning_rate * dp
        self._Q[j, :] += self._learning_rate * dq

    def get_prediction(self, i, j):
        """
        get predicted rating for user_i, item_j
        :return: prediction of r_ij
        """
        return self._b + self._b_P[i] + self._b_Q[j] + self._P[i, :].dot(self._Q[j, :].T)

    def get_complete_matrix(self):
        """
        compute the complete matrix: P x Q + user bias + item bias + global bias
        - adding b_P[:, np.newaxis] adds the user bias to every column
        - adding b_Q[np.newaxis, :] adds the item bias to every row
        - adding b adds the global bias to every element
        - np.newaxis adds a dimension so the 1-D bias vectors broadcast
          over the rows/columns of the 2-D matrix
        :return: complete matrix R^
        """
        return self._b + self._b_P[:, np.newaxis] + self._b_Q[np.newaxis, :] + self._P.dot(self._Q.T)
Below, the MatrixFactorization class defined above is put to use.
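The rating matrix R, along with useridx, idx2user, and idx2book used further below, is built elsewhere in the original notebook. A minimal reconstruction from train, assuming implicit 0/1 feedback (an assumption; the notebook may use explicit ratings), could be:

# Hypothetical reconstruction of R and the index mappings used below
user_ids = train['user_id'].unique()
book_ids = train['book_id'].unique()
user2idx = {u: i for i, u in enumerate(user_ids)}
book2idx = {b: i for i, b in enumerate(book_ids)}
idx2user = {i: u for u, i in user2idx.items()}
idx2book = {i: b for b, i in book2idx.items()}

data = train.copy()
data['useridx'] = data['user_id'].map(user2idx)
data['bookidx'] = data['book_id'].map(book2idx)

# Implicit feedback: 1 if the user read the book, 0 otherwise
# (note: a dense matrix; fine for a subsample, heavy for the full data)
R = np.zeros((len(user_ids), len(book_ids)))
R[data['useridx'], data['bookidx']] = 1.0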
%%time
factorizer = MatrixFactorization(R, k=20, learning_rate=0.01, reg_param=0.01, epochs=100, verbose=True)
factorizer.fit()
sgd_rec_model = factorizer.get_complete_matrix()
# Collect the list of books each user has already read
read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list.head()
total_rec_list = {}
for user in tqdm(data['useridx'].unique()):
    rec_list = []
    # Map the internal indices back to the original book ids,
    # excluding the user's own index from the candidates
    rating_scores = [(idx2book[i], c) for i, c in enumerate(sgd_rec_model[user]) if i != user]
    rating_scores = sorted(rating_scores, key=lambda x: x[1], reverse=True)  # highest predicted rating first

    seen = read_list[read_list['user_id'] == idx2user[user]]['unique'].values[0]
    for rec in rating_scores[0:250]:
        if rec[0] not in seen:
            rec_list.append(rec[0])

    # Pad with the popularity model, skipping books already read
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            if i not in seen:
                rec_list.append(i)

    total_rec_list[idx2user[user]] = rec_list[0:200]
The code below defines the three evaluation metrics: MAP, NDCG, and entropy diversity.
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py
class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt
        self.topn = topn

    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)

            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q

    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n

    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent

    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()
Next, the same pipeline is implemented with ALS (from the implicit library).
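als_model.fit below consumes purchase_sparse, a user-by-item sparse matrix that the original notebook builds elsewhere; a minimal reconstruction (an assumption, reusing useridx/bookidx from the MF sketch above) is:

from scipy.sparse import csr_matrix

# Hypothetical reconstruction: user x item CSR matrix of 0/1 read indicators
purchase_sparse = csr_matrix(
    (np.ones(data.shape[0]), (data['useridx'], data['bookidx'])),
    shape=(len(user_ids), len(book_ids))
)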
from implicit.evaluation import *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR

als_model = ALS(factors=20, regularization=0.01, iterations=100)
als_model.fit(purchase_sparse.T)  # older versions of implicit expect an item x user matrix here
als_model.recommend(0, purchase_sparse, N=200)[0:10]
total_rec_list = {}
for user in tqdm(data['useridx'].unique()):
    rec_list = []
    seen = read_list[read_list['user_id'] == idx2user[user]]['unique'].values[0]

    # Map the internal indices back to the original book ids
    recs = als_model.recommend(user, purchase_sparse, N=250)
    recs = [idx2book[x[0]] for x in recs][0:250]

    for rec in recs:
        if rec not in seen:
            rec_list.append(rec)

    # Pad with the popularity model, skipping books already read
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            if i not in seen:
                rec_list.append(i)

    total_rec_list[idx2user[user]] = rec_list[0:200]
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()