01. goodbooks-10k Exploratory Data Analysis
Before building a baseline, we need to explore the data. When first building a recommender system, it is important to decide on the target user group and to understand the characteristics of the items and users in order to decide which items to recommend. Exploratory data analysis is exactly the tool for this.
# Load packages
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm

# Path setting - as in the course material
path = '../input/t-academy-recommendation2/books/'

# Path setting - mounting Google Drive (when running on Colab)
from google.colab import drive
drive.mount('/content/drive')
The dataset consists of the following files:
- books.csv : book metadata
- book_tags.csv : book-tag mapping information
- ratings.csv : the ratings users gave to books
- tags.csv : tag information
- to_read.csv : books users have marked as to-read
# Load the data
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
ratings = pd.read_csv(path + "ratings.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")

# Book metadata
books.head()
books.columns
books['small_image_url'].values[0]
For books, the cover matters as well as the content, so content-based recommendation is also possible: extract image features from each book's cover and find similar books with a model such as a CNN.
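The notebook doesn't implement this, but a minimal sketch of the idea, assuming torchvision and scikit-learn are available and reusing the small_image_url column referenced above, could look like this:

import numpy as np
import requests
import torch
import torchvision.models as models
import torchvision.transforms as T
from io import BytesIO
from PIL import Image
from sklearn.neighbors import NearestNeighbors

# Pretrained CNN as a fixed feature extractor: drop the classifier head
cnn = models.resnet18(pretrained=True)
cnn.fc = torch.nn.Identity()
cnn.eval()

preprocess = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def cover_embedding(url):
    # Download a cover image and map it to a 512-dim CNN feature vector
    img = Image.open(BytesIO(requests.get(url).content)).convert('RGB')
    with torch.no_grad():
        return cnn(preprocess(img).unsqueeze(0)).squeeze(0).numpy()

# Embed a sample of covers and index them for cosine nearest-neighbour search
emb = np.stack([cover_embedding(u) for u in books['small_image_url'].head(100)])
knn = NearestNeighbors(metric='cosine').fit(emb)
dist, idx = knn.kneighbors(emb[0:1], n_neighbors=6)  # the book itself + 5 similar covers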
# Keep only the columns we need
books = books[['book_id', 'authors', 'title', 'ratings_count', 'average_rating', 'language_code']].reset_index(drop=True)

# plotnine: R's ggplot ported to Python, so ggplot-style code works almost as-is.
# Because ggplot's data structure is explicit, we pass in the data to use and
# can express a column as a histogram.

# Count how many books each author wrote (1 to 60 books per author)
agg = books.groupby('authors')['authors'].agg({'count'})

(ggplot(data=agg)
 + geom_histogram(aes(x='count'), binwidth=1, fill='#49beb7')
 + labs(title="Number of the Author's Book", x='Book Count', y='Author Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)

# Summary of the book metadata
print("Number of books:", books['book_id'].nunique())
print("Number of authors:", books['authors'].nunique(), "\n")
print(pd.DataFrame(agg['count'].describe()).T)

# Most prolific authors
agg.sort_values(by='count', ascending=False)

# Average rating of the books
(ggplot(data=books)
 + geom_histogram(aes(x='average_rating'), binwidth=0.1, fill='#49beb7')
 + labs(title="Average Rating of the Books", x='Average Rating', y='Book Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)
Looking at the average ratings, 14 books have a rating of 3 or below. Even if their similarity scores are high, it can be better for training not to recommend such books. So we check whether there is any reason to recommend them, and usually end up excluding them. The code is below.
# Books with little reason to be recommended (average rating of 3 or below)
books[books['average_rating'] <= 3].shape[0]

# The highest rated books
books.sort_values(by='average_rating', ascending=False).head()

# Distribution of how often books are rated
(ggplot(data=books)
 + geom_histogram(aes(x='ratings_count'), binwidth=10000, fill='#49beb7')
 + labs(title="Rating Count of the Books", x='Rating Count', y='Book Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)
It is sometimes good to recommend highly rated books first, but a high rating may come from a book that few people have actually read, i.e., one with a small sample size. So always recommending only the highest rated books is not the right approach.
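One common middle ground, not part of the original notebook but a useful sketch here, is an IMDB-style weighted rating that shrinks the average of rarely rated books toward the global mean (the 0.60 quantile threshold m is an arbitrary choice):

# Weighted rating: books with few ratings are pulled toward the global mean C
C = books['average_rating'].mean()          # global mean rating
m = books['ratings_count'].quantile(0.60)   # minimum-votes threshold (assumed)
v = books['ratings_count']
R = books['average_rating']
books['weighted_rating'] = (v / (v + m)) * R + (m / (v + m)) * C
books.sort_values(by='weighted_rating', ascending=False).head()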
pd.DataFrame(books['ratings_count'].describe()).T

# Compare ratings_count with average_rating during exploration
books.sort_values(by='ratings_count', ascending=False).head()

# Language codes of the books
agg = pd.DataFrame(books['language_code'].value_counts()).reset_index()
agg.columns = ['language_code', 'count']

(ggplot(data=agg)
 + geom_bar(aes(x='language_code', y='count'), fill='#49beb7', stat='identity')
 + labs(title="Language Code of the Books", x='Language Code', y='Book Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)
In the metadata, missing values and inaccurate entries show up from time to time.
We also need to recommend books written in a language that matches the user's locale.
books['language_code'].unique()
books.isnull().sum()

# The ratings users gave to books
ratings

# Count of book_ids that appear in ratings but not in books
## i.e., the length of the set difference between the two id sets
len(set(ratings['book_id'].unique()).difference(set(books['book_id'].unique())))
Of the 10,000 books, 9,188 turned out to have no metadata. In other words, although users actually rated them, only 812 of the rated books appear in the metadata. Taken together, this suggests that a content-based recommender would be inaccurate and perform poorly here, so it is more advantageous to use collaborative filtering or a statistics-based model.
Checking the books' tag information
The relevant files:
- book_tags : the tag_ids mapped to each book_id
- tags : the mapping between tag_id and tag_name
# Look at the tag information
book_tags.head()

# Table mapping each tag_id to its tag_name
tags.head()

# Join the tag names onto book_tags via tag_id
book_tags = pd.merge(tags, book_tags, how='left', on='tag_id')
book_tags

agg = book_tags.groupby(['tag_name'])['count'].agg({'sum'}).reset_index()
agg = agg.sort_values(by='sum', ascending=False).reset_index(drop=True)
agg.head()

# Plot the top 20 tags
(ggplot(data=agg.loc[0:19])
 + geom_bar(aes(x='tag_name', y='sum'), fill='#49beb7', stat="identity")
 + labs(title="Top 20: Tag Count", x='Tag', y='Tag Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black', rotation=60),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color='black'),
     axis_ticks=element_line(color='grey'),
     figure_size=(10, 6))
)
Here the to-read tag appears far more often than any other tag. Just as TF-IDF downweights or removes words that occur too frequently, one option is to remove the to-read tag and compare only the remaining tags.
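As a sketch of that option (which shelves count as noise is a judgment call; the names below are only examples of bookkeeping shelves in this dataset):

# Drop bookkeeping shelves that dominate the counts, analogous to removing stopwords in TF-IDF
noise_tags = ['to-read', 'currently-reading', 'owned', 'books-i-own']
agg_filtered = agg[~agg['tag_name'].isin(noise_tags)].reset_index(drop=True)
agg_filtered.head(20)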
# Descriptive statistics of how many times each tag was used
pd.DataFrame(agg['sum'].describe()).T
The spread across books is very large: the minimum is tiny while the maximum is huge. Using this tag information, we can recommend based on a user's affinity for the tags they browse, and also recommend books whose tags are similar.
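The tag-similarity idea isn't implemented in the notebook; a minimal sketch using TF-IDF over each book's tag names (column names follow the merged book_tags frame above) might be:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One "document" per book: the concatenation of its tag names
docs = (book_tags.groupby('goodreads_book_id')['tag_name']
        .apply(lambda t: ' '.join(t.astype(str))).reset_index())

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(docs['tag_name'])

# The 5 books whose tag profiles are most similar to the first book
sim = cosine_similarity(X[0], X).ravel()
top = sim.argsort()[::-1][1:6]
docs.loc[top, 'goodreads_book_id']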
Checking the ratings information
- Descriptive statistics over all books and users
# Check the ratings information
# Distribution of how many books each user has rated
agg = ratings.groupby(['user_id'])['book_id'].agg({'count'}).reset_index()

(ggplot(data=agg)
 + geom_histogram(aes(x='count'), binwidth=5, fill='#49beb7')
 + labs(title='Average Number of the Read Count', x='Read Count', y='User Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color="black"),
     axis_ticks=element_line(color="grey"),
     figure_size=(10, 6))
)
pd.DataFrame(agg['count'].describe()).T
- On average, a user reads 18 books
- Every user bought and read at least 2 books
- The heaviest reader bought 200 books
# Distribution of how many users have read each book
agg = ratings.groupby(['book_id'])['book_id'].agg({'count'}).reset_index()

(ggplot(data=agg)
 + geom_histogram(aes(x='count', y='stat(count)'), fill='#49beb7', binwidth=5)
 + theme_minimal()
 + ggtitle("Average Read Count per Book")
 + labs(x="Read Count", y="Book Count")
 + theme(
     axis_text_x=element_text(angle=60, color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color="black"),
     axis_ticks=element_line(color="grey"),
     figure_size=(8, 4))
)
pd.DataFrame(agg['count'].describe()).T
- Each book was read by at least 8 users, and the widely read ones by 100 users
- The spread is large, but most books with ratings turn out to be ones read by about 100 users each
agg.head()

# Look at the first Harry Potter series entries (book_ids 1-8)
books[books['book_id'].isin([1, 2, 3, 4, 5, 6, 7, 8])].head()
ratings['user_id'].unique()
ratings[(ratings['user_id'] == 314) & (ratings['book_id'].isin([1, 2, 3, 4, 5, 6, 7, 8]))]
# How many of the 8 series books each user has read
agg = ratings[ratings['book_id'].isin([1, 2, 3, 4, 5, 6, 7, 8])].groupby(['user_id'])['book_id'].agg({'nunique'})
agg = agg.reset_index()
agg = agg.groupby(['nunique'])['user_id'].agg({'count'}).reset_index()

(ggplot(data=agg)
 + geom_bar(aes(x='nunique', y='count'), fill='#49beb7', stat="identity")
 + labs(title="Harry Potter's Reading Count", x='Series Count', y='Reading Person Count')
 + theme_light()
 + theme(
     axis_text_x=element_text(color='black'),
     axis_text_y=element_text(color='black'),
     axis_line=element_line(color="black"),
     axis_ticks=element_line(color="grey"),
     figure_size=(10, 6))
)
agg['ratio'] = agg['count'] / agg['count'].sum()
agg[['nunique', 'ratio']].T
- Series such as Harry Potter tend to be read together
to_read information
to_read.head()
to_read['user_id'].nunique()
- Not only what users have already read but also the books they plan to read can be combined into the recommendation, as sketched below
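For example, a small sketch that turns the to-read shelf into a per-user candidate list, which could then be ranked ahead of other candidates:

# Each user's to-read shelf as a high-priority candidate list
cart = to_read.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
cart.columns = ['user_id', 'to_read_books']
cart.head()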
Creating the train and validation sets
# Bundle each user's read book_ids
agg = ratings.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
agg.head()
# Randomly take half of each user's books for training (sampling without replacement)
agg['train'] = agg['unique'].apply(lambda x: np.random.choice(x, len(x) // 2, replace=False))
agg.head()
# The remaining books form the test set
test = []
for i in tqdm(range(0, agg.shape[0])):
    test_rec = list(set(agg.loc[i, 'unique']).difference(set(agg.loc[i, 'train'])))
    test.append(test_rec)

agg['test'] = test
# train dataset
train_map = agg[['user_id', 'train']]

# unnest the per-user lists into (user_id, book_id) rows
train_map_unnest = np.dstack(
    (
        np.repeat(train_map.user_id.values, list(map(len, train_map.train))),
        np.concatenate(train_map.train.values)
    )
)
train_map_unnest = pd.DataFrame(data=train_map_unnest[0], columns=train_map.columns)
train_map_unnest.head()
# test dataset
test_map = agg[['user_id', 'test']]

# unnest the per-user lists into (user_id, book_id) rows
test_map_unnest = np.dstack(
    (
        np.repeat(test_map.user_id.values, list(map(len, test_map.test))),
        np.concatenate(test_map.test.values)
    )
)
test_map_unnest = pd.DataFrame(data=test_map_unnest[0], columns=test_map.columns)
test_map_unnest.head()
train_map_unnest.columns = ['user_id', 'book_id']
test_map_unnest.columns = ['user_id', 'book_id']
train_map_unnest.to_csv("train.csv", index=False)
test_map_unnest.to_csv("test.csv", index=False)
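Because the test set is the set difference of each user's books against the sampled train half, the two splits should be disjoint per user; a quick sanity check over the frames just written:

# Sanity check: no (user_id, book_id) pair may appear in both splits
overlap = train_map_unnest.merge(test_map_unnest, on=['user_id', 'book_id'], how='inner')
assert overlap.shape[0] == 0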
- 53,424 users bought books in total, over a catalogue of 10,000 books
- 48,871 of them added books to their to-read shelf
- On average, each user has more than 2 books
02. goodbooks-10k Baseline Model
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm
path = '../input/t-academy-recommendation2/books/'
- books.csv : book metadata
- book_tags.csv : book-tag mapping information
- ratings.csv : the ratings users gave to books
- tags.csv : tag information
- to_read.csv : books users have marked as to-read (their cart)
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)
# Ground truth: the unique book_ids each user holds in the test split
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()):
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])
The setting: recommending 100 books per user.
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()
Baseline
- A statistics-based model
books.sort_values(by='books_count', ascending=False)[0:5]
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    for rec in popular_rec_model[0:200]:
        rec_list.append(rec)

    total_rec_list[user] = rec_list
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py
class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt
        self.topn = topn

    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)

            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q

    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n

    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent

    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()
Extending the baseline
- Among all books, recommend the highly rated ones first
- Recommend books by the user's favorite authors first
- Recommend books on the to-read shelf, and books by those authors, first
- If a read book belongs to a series, recommend the rest of the series
- Recommend the newest books
train = pd.merge(train, books[['book_id', 'authors', 'ratings_count']], how='left', on='book_id')
agg = train.groupby(['user_id', 'authors'])['authors'].agg({'count'}).reset_index()
agg = agg.sort_values(by='count', ascending=False)
agg.head()
author_books = books[['book_id', 'authors', 'ratings_count']].sort_values(by=['authors', 'ratings_count'], ascending=[True, False])
author_books = author_books.reset_index(drop=True)
author_books.head()
author_rec_model = agg.merge(author_books, how='left', on=['authors'])
author_rec_model.head()
author_rec_model[author_rec_model['user_id'] == 30944]['book_id'].values
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    author_rec_model_ = author_rec_model[author_rec_model['user_id'] == user]['book_id'].values
    for rec in author_rec_model_:
        rec_list.append(rec)

    # Pad with the popularity model if fewer than 200 recommendations remain
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            rec_list.append(i)

    total_rec_list[user] = rec_list[0:200]
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()
Post-processing
- Books the user has already read must not be recommended
- Books should match a language the user reads
# Collect the list of books each user has already read
read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list.head()
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    author_rec_model_ = author_rec_model[author_rec_model['user_id'] == user]['book_id'].values
    seen = read_list[read_list['user_id'] == user]['unique'].values[0]
    for rec in author_rec_model_:
        if rec not in seen:
            rec_list.append(rec)

    # Pad with the popularity model, again skipping books already read
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            if i not in seen:
                rec_list.append(i)

    total_rec_list[user] = rec_list[0:200]
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()
# Collect the languages each user can read
## With user metadata this would be easy to look up; since there is none, we build it ourselves:
## gather the languages of the books each user has read and treat those as the user's languages
language = pd.merge(train, books[['book_id', 'language_code']], how='left', on='book_id')
language_list = language.groupby(['user_id'])['language_code'].agg({'unique'}).reset_index()
language_list.head()
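The section stops after collecting each user's languages; a minimal sketch of the remaining filtering step (the helper language_filter is hypothetical, not part of the original notebook) could be:

# Hypothetical post-processing helper: keep only candidates whose language
# the user has read before; books with an unknown language_code are kept
user_langs = dict(zip(language_list['user_id'], language_list['unique']))
book_lang = dict(zip(books['book_id'], books['language_code']))

def language_filter(user, rec_list):
    langs = set(user_langs.get(user, []))
    return [b for b in rec_list
            if pd.isnull(book_lang.get(b)) or book_lang.get(b) in langs]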
03. goodbooks-10k Collaborative Filtering Model
Here we write the code that makes recommendations to users with a collaborative filtering model.
# Load the required libraries
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm
# Set the data path
path = '../input/t-academy-recommendation2/books/'
# Read the csv files from the path
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")
# Cast book_id to string
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)
The table has 23 columns in total, but the trailing columns were trimmed for readability. For the recommendation we extract the book_ids sorted by books_count.
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]
To make suitable recommendations, we group all the book_ids each user has read, keyed by user_id.
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()):
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()
From here on, the code concerns Matrix Factorization. It consists of the functions fit, cost, gradient, gradient_descent, get_prediction, and get_complete_matrix.
fit: loops over the epochs, computing the cost and running gradient descent at each step
cost: computes the RMSE between the observed ratings and the current predictions
gradient: computes the gradients of the latent features
gradient_descent: updates the matrices using the gradients and the learning rate
get_prediction: computes the predicted rating from the factorized matrices
get_complete_matrix: reconstructs the full rating matrix, including all bias terms
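Concretely, get_prediction combines a global bias, per-user and per-item biases, and the latent factors:

$$\hat{r}_{ij} = b + b^{(P)}_i + b^{(Q)}_j + P_i Q_j^{\top}$$

and gradient_descent runs SGD on the regularized squared error $(r_{ij} - \hat{r}_{ij})^2 + \lambda\,(\lVert P_i \rVert^2 + \lVert Q_j \rVert^2)$, which is exactly what the gradient method differentiates.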
import numpy as np
from tqdm import tqdm_notebook as tqdm

# Base code : https://yamalab.tistory.com/92
class MatrixFactorization():
    def __init__(self, R, k, learning_rate, reg_param, epochs, verbose=False):
        """
        :param R: rating matrix
        :param k: latent parameter
        :param learning_rate: alpha on weight update
        :param reg_param: beta on weight update
        :param epochs: training epochs
        :param verbose: print status
        """
        self._R = R
        self._num_users, self._num_items = R.shape
        self._k = k
        self._learning_rate = learning_rate
        self._reg_param = reg_param
        self._epochs = epochs
        self._verbose = verbose

    def fit(self):
        """
        training Matrix Factorization: update latent weights and biases

        Note on self._b:
        - global bias: the mean of the observed (nonzero) ratings in R
        - acts as normalization, so negative values end up in the latent
          features rather than in the final rating
        :return: training_process
        """
        # init latent features
        self._P = np.random.normal(size=(self._num_users, self._k))
        self._Q = np.random.normal(size=(self._num_items, self._k))

        # init biases
        self._b_P = np.zeros(self._num_users)
        self._b_Q = np.zeros(self._num_items)
        self._b = np.mean(self._R[np.where(self._R != 0)])

        # train for the given number of epochs
        self._training_process = []
        for epoch in range(self._epochs):
            # train only on the indices where a rating exists
            xi, yi = self._R.nonzero()
            for i, j in zip(xi, yi):
                self.gradient_descent(i, j, self._R[i, j])
            cost = self.cost()
            self._training_process.append((epoch, cost))

            # print status
            if self._verbose == True and ((epoch + 1) % 10 == 0):
                print("Iteration: %d ; cost = %.4f" % (epoch + 1, cost))

    def cost(self):
        """
        compute root mean square error over the observed entries
        :return: rmse cost
        """
        # xi, yi: indices where R[xi, yi] is nonzero
        # reference: http://codepractice.tistory.com/90
        xi, yi = self._R.nonzero()
        cost = 0
        for x, y in zip(xi, yi):
            cost += pow(self._R[x, y] - self.get_prediction(x, y), 2)
        return np.sqrt(cost / len(xi))

    def gradient(self, error, i, j):
        """
        gradient of the latent features for GD
        :param error: rating - prediction error
        :param i: user index
        :param j: item index
        :return: gradient of latent feature tuple
        """
        dp = (error * self._Q[j, :]) - (self._reg_param * self._P[i, :])
        dq = (error * self._P[i, :]) - (self._reg_param * self._Q[j, :])
        return dp, dq

    def gradient_descent(self, i, j, rating):
        """
        gradient descent step for one observed rating
        :param i: user index of matrix
        :param j: item index of matrix
        :param rating: rating of (i, j)
        """
        # get error
        prediction = self.get_prediction(i, j)
        error = rating - prediction

        # update biases
        self._b_P[i] += self._learning_rate * (error - self._reg_param * self._b_P[i])
        self._b_Q[j] += self._learning_rate * (error - self._reg_param * self._b_Q[j])

        # update latent features
        dp, dq = self.gradient(error, i, j)
        self._P[i, :] += self._learning_rate * dp
        self._Q[j, :] += self._learning_rate * dq

    def get_prediction(self, i, j):
        """
        get predicted rating for user_i, item_j
        :return: prediction of r_ij
        """
        return self._b + self._b_P[i] + self._b_Q[j] + self._P[i, :].dot(self._Q[j, :].T)

    def get_complete_matrix(self):
        """
        compute the complete matrix: P x Q + user bias + item bias + global bias
        - adding b_P[:, np.newaxis] adds the user bias to every column
        - adding b_Q[np.newaxis, :] adds the item bias to every row
        - adding b adds the global bias to every element
        - np.newaxis adds a dimension so the 1-D bias vectors broadcast
          over the rows/columns of the 2-D matrix
        :return: complete matrix R^
        """
        return self._b + self._b_P[:, np.newaxis] + self._b_Q[np.newaxis, :] + self._P.dot(self._Q.T)
Below, the MatrixFactorization class defined above is put to use.
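The rating matrix R, along with useridx, idx2user, and idx2book used further below, is built elsewhere in the original notebook. A minimal reconstruction from train, assuming implicit 0/1 feedback (an assumption; the notebook may use explicit ratings), could be:

# Hypothetical reconstruction of R and the index mappings used below
user_ids = train['user_id'].unique()
book_ids = train['book_id'].unique()
user2idx = {u: i for i, u in enumerate(user_ids)}
book2idx = {b: i for i, b in enumerate(book_ids)}
idx2user = {i: u for u, i in user2idx.items()}
idx2book = {i: b for b, i in book2idx.items()}

data = train.copy()
data['useridx'] = data['user_id'].map(user2idx)
data['bookidx'] = data['book_id'].map(book2idx)

# Implicit feedback: 1 if the user read the book, 0 otherwise
# (note: a dense matrix; fine for a subsample, heavy for the full data)
R = np.zeros((len(user_ids), len(book_ids)))
R[data['useridx'], data['bookidx']] = 1.0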
%%time
factorizer = MatrixFactorization(R, k=20, learning_rate=0.01, reg_param=0.01, epochs=100, verbose=True)
factorizer.fit()
sgd_rec_model = factorizer.get_complete_matrix()
# Collect the list of books each user has already read
read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list.head()
total_rec_list = {}
for user in tqdm(data['useridx'].unique()):
    rec_list = []
    # Map the internal indices back to the original book ids,
    # excluding the user's own index from the candidates
    rating_scores = [(idx2book[i], c) for i, c in enumerate(sgd_rec_model[user]) if i != user]
    rating_scores = sorted(rating_scores, key=lambda x: x[1], reverse=True)  # highest predicted rating first

    seen = read_list[read_list['user_id'] == idx2user[user]]['unique'].values[0]
    for rec in rating_scores[0:250]:
        if rec[0] not in seen:
            rec_list.append(rec[0])

    # Pad with the popularity model, skipping books already read
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            if i not in seen:
                rec_list.append(i)

    total_rec_list[idx2user[user]] = rec_list[0:200]
The code below defines the three evaluation metrics: MAP, NDCG, and entropy diversity.
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py
class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt
        self.topn = topn

    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)

            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q

    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n

    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent

    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()
Next, the same pipeline is implemented with ALS (from the implicit library).
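als_model.fit below consumes purchase_sparse, a user-by-item sparse matrix that the original notebook builds elsewhere; a minimal reconstruction (an assumption, reusing useridx/bookidx from the MF sketch above) is:

from scipy.sparse import csr_matrix

# Hypothetical reconstruction: user x item CSR matrix of 0/1 read indicators
purchase_sparse = csr_matrix(
    (np.ones(data.shape[0]), (data['useridx'], data['bookidx'])),
    shape=(len(user_ids), len(book_ids))
)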
from implicit.evaluation import *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR

als_model = ALS(factors=20, regularization=0.01, iterations=100)
als_model.fit(purchase_sparse.T)  # older versions of implicit expect an item x user matrix here
als_model.recommend(0, purchase_sparse, N=200)[0:10]
total_rec_list = {}
for user in tqdm(data['useridx'].unique()):
    rec_list = []
    seen = read_list[read_list['user_id'] == idx2user[user]]['unique'].values[0]

    # Map the internal indices back to the original book ids
    recs = als_model.recommend(user, purchase_sparse, N=250)
    recs = [idx2book[x[0]] for x in recs][0:250]

    for rec in recs:
        if rec not in seen:
            rec_list.append(rec)

    # Pad with the popularity model, skipping books already read
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            if i not in seen:
                rec_list.append(i)

    total_rec_list[idx2user[user]] = rec_list[0:200]
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()