
Lecture 6 - Hands-on Recommendation System with the Goodbooks-10k Data

ย 
ย 

01. goodbooks-10k Exploratory Data Analysis

ย 
๋ฒ ์ด์Šค๋ผ์ธ์„ ๋งŒ๋“ค์–ด ์ฃผ๊ธฐ ์ „์— ๋ฐ์ดํ„ฐ ํƒ์ƒ‰์ด ํ•„์š”ํ•œ๋ฐ, ์ฒ˜์Œ ์ถ”์ฒœ์‹œ์Šคํ…œ์„ ๋งŒ๋“ค ๋•Œ ์‚ฌ์šฉ์ž์˜ ํƒ€๊ฒŸ ๊ตฐ์„ ์ •ํ•ด์•ผ ํ•˜๊ณ  ์–ด๋–ค ์ƒํ’ˆ์„ ์ถ”์ฒœํ•ด์ค„์ง€ ์ƒํ’ˆ๊ณผ ์‚ฌ์šฉ์ž์˜ ํŠน์„ฑ์„ ํŒŒ์•…ํ•˜๋Š” ๊ฒƒ์ด ์ค‘์š”ํ•˜๋‹ค. ์ด๋•Œ ํ™œ์šฉํ•  ์ˆ˜ ์žˆ๋Š” ๊ฒƒ์ด ๋ฐ”๋กœ ๋ฐ์ดํ„ฐ ํƒ์ƒ‰์ด๋‹ค.
# Load packages
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm

# Path setting - original course setup
path = '../input/t-academy-recommendation2/books/'

# Path setting - alternative: mount Google Drive on Colab
from google.colab import drive
drive.mount('/content/drive')
The files are:
books.csv : book metadata
book_tags.csv : book-to-tag mapping information
ratings.csv : ratings that users gave to books
tags.csv : tag information
to_read.csv : books that users marked as to-read
ย 
# ๋ฐ์ดํ„ฐ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ books = pd.read_csv(path + "books.csv") book_tags = pd.read_csv(path + "books_tags.csv") ratings = pd.read_csv(path + "ratings.csv") tags = pd,read_csv(path + "tags.csv") to_read = pd.read_csv(path + "to_read.csv") # ์ฑ…์˜ ๋ฉ”ํƒ€์ •๋ณด books.head() books.columns books['small_image_url].values[0]
For books, the cover influences readers as much as the content does, so content-based recommendation is also possible: extract image features from each cover with a model such as a CNN and use them to find similar books.
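As an illustration of that idea only (none of this is in the course code; the ResNet50 choice and the cover_paths dict mapping each book_id to a downloaded cover file are assumptions), a rough sketch of embedding covers with a pretrained CNN and ranking books by cosine similarity:

import numpy as np
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image

cnn = ResNet50(weights='imagenet', include_top=False, pooling='avg')  # 2048-d embedding

def embed_cover(img_path):
    # Turn one cover image into a fixed-length feature vector.
    img = image.load_img(img_path, target_size=(224, 224))
    x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))
    return cnn.predict(x)[0]

# cover_paths: hypothetical dict {book_id: local path of the downloaded image_url}
embeddings = {bid: embed_cover(p) for bid, p in cover_paths.items()}

def most_similar_covers(book_id, topn=10):
    # Rank all other books by cosine similarity of their cover embeddings.
    q = embeddings[book_id]
    sims = {b: np.dot(q, v) / (np.linalg.norm(q) * np.linalg.norm(v) + 1e-9)
            for b, v in embeddings.items() if b != book_id}
    return sorted(sims, key=sims.get, reverse=True)[:topn]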
ย 
# Keep only the columns we need
books = books[['book_id', 'authors', 'title', 'ratings_count',
               'average_rating', 'language_code']].reset_index(drop=True)

# plotnine: the Python port of R's ggplot
# How many books did each author write?
agg = books.groupby('authors')['authors'].agg({'count'})

# Each author wrote between 1 and 60 books. ggplot's grammar makes the data
# structure explicit: declare which data to use, then draw the column as a histogram.
(ggplot(data = agg)
 + geom_histogram(aes(x='count'), binwidth = 1, fill = '#49beb7')
 + labs(title = "Number of the Author's Book", x = 'Book Count', y = 'Author Count')
 + theme_light()
 + theme(
     axis_text_x = element_text(color='black'),
     axis_text_y = element_text(color='black'),
     axis_line = element_line(color='black'),
     axis_ticks = element_line(color='grey'),
     figure_size = (10,6)))

# What the book metadata tells us
print("Number of books:", books['book_id'].nunique())
print("Number of authors:", books['authors'].nunique(), "\n")
print(pd.DataFrame(agg['count'].describe()).T)

# Find the most prolific authors
agg.sort_values(by='count', ascending=False)

# Average rating of the books
(ggplot(data = books)
 + geom_histogram(aes(x='average_rating'), binwidth = 0.1, fill = '#49beb7')
 + labs(title = "Average Rating of the Books", x = 'Average Rating', y = 'Book Count')
 + theme_light()
 + theme(
     axis_text_x = element_text(color='black'),
     axis_text_y = element_text(color='black'),
     axis_line = element_line(color='black'),
     axis_ticks = element_line(color='grey'),
     figure_size = (10,6)))
ย 
ํ‰๊ท ํ‰์ ์„ ๋ณด๋ฉด, 3์  ์ดํ•˜์˜ ํ‰์ ์ธ ์ฑ…์ด 14๊ถŒ ๋‚˜์˜ค๋Š”๋ฐ ์ด๋Ÿฐ ์ฑ…๋“ค์€ ์œ ์‚ฌ๋„๊ฐ€ ์•„๋ฌด๋ฆฌ ๋†’๋‹ค๊ณ  ํ•˜๋”๋ผ๋„ ์ถ”์ฒœ์„ ์•ˆํ•˜๋Š” ๊ฒƒ์ด ํ•™์Šต์— ์œ ๋ฆฌํ•  ์ˆ˜ ์žˆ๋‹ค. ๋”ฐ๋ผ์„œ ์ด๋Ÿฌํ•œ ์ฑ…๋“ค์€ ์ถ”์ฒœํ•ด์ค„๋งŒํ•œ ์ด์œ ๊ฐ€ ์žˆ๋Š”์ง€ ์‚ดํŽด๋ณด๊ณ , ๋ณดํ†ต์€ ์ถ”์ฒœํ•˜์ง€ ์•Š๋Š” ๋ฐฉํ–ฅ์œผ๋กœ ๊ฐ€๊ฒŒ ๋œ๋‹ค. ํ•ด๋‹น ๋‚ด์šฉ์€ ์•„๋ž˜์™€ ๊ฐ™๋‹ค.
# Find the books that don't need to be recommended
books[books['average_rating'] <= 3].shape[0]

# Find the top-rated books
books.sort_values(by='average_rating', ascending=False).head()

# Recommending by high rating alone
(ggplot(data = books)
 + geom_histogram(aes(x='ratings_count'), binwidth = 10000, fill = '#49beb7')
 + labs(title = "Rating Count of the Books", x = 'Rating Count', y = 'Book Count')
 + theme_light()
 + theme(
     axis_text_x = element_text(color='black'),
     axis_text_y = element_text(color='black'),
     axis_line = element_line(color='black'),
     axis_ticks = element_line(color='grey'),
     figure_size = (10,6)))
ย 
Recommending the highest-rated books first can be a good strategy, but a high average rating may come from a book that few people have actually read, i.e. one with a small sample size, so always recommending only the highest-rated books is not sound.
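A common remedy, shown here as a hedged sketch rather than as part of the course code, is the weighted-rating (Bayesian average) formula popularized by IMDB: shrink the average rating of sparsely rated books toward the global mean. The 90th-percentile cutoff for m is an assumption.

# Weighted rating: shrink sparsely rated books toward the global mean rating
C = books['average_rating'].mean()          # global mean rating
m = books['ratings_count'].quantile(0.90)   # assumed minimum-votes threshold
v = books['ratings_count']
R = books['average_rating']
books['weighted_rating'] = (v / (v + m)) * R + (m / (v + m)) * C
books.sort_values(by='weighted_rating', ascending=False).head()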
pd.DataFrame(books['ratings_count'].describe()).T

# Compare ratings_count with average_rating during exploration
books.sort_values(by='ratings_count', ascending=False).head()

# Language-code information of the books
agg = pd.DataFrame(books['language_code'].value_counts()).reset_index()
agg.columns = ['language_code', 'count']
(ggplot(data = agg)
 + geom_bar(aes(x='language_code', y='count'), fill = '#49beb7', stat = "identity")
 + labs(title = "Language Code of the Books", x = 'Language Code', y = 'Book Count')
 + theme_light()
 + theme(
     axis_text_x = element_text(color='black', rotation=60),
     axis_text_y = element_text(color='black'),
     axis_line = element_line(color='black'),
     axis_ticks = element_line(color='grey'),
     figure_size = (10,6)))
Being metadata, this table quite often shows missing values and inaccurate entries.
ย 
Books should be recommended in the language that matches the user's locale.
books['language_code'].unique()
books.isnull().sum()

# Ratings that users gave to books
ratings

# Count the book_ids that appear in ratings but not in books
## i.e. the length of the set difference between the two id sets
len(set(ratings['book_id'].unique()).difference(set(books['book_id'].unique())))
Out of the 10,000 books, 9,188 turned out to have no metadata; of the books that users actually rated, only 812 appear in the metadata. Taken together, this suggests that a content-based recommender would be inaccurate and perform poorly here, so it is better to use collaborative filtering or statistics-based models.
ย 

ย 

Checking the book tag information

Here,
book_tags : the tag_ids mapped to each book_id
tags : the mapping between tag_id and tag_name
ย 
# A look at the tag data
book_tags.head()

# Table mapping each tag_id to its tag_name
tags.head()

# Join the columns on tag_id
book_tags = pd.merge(tags, book_tags, how='left', on='tag_id')
book_tags

agg = book_tags.groupby(['tag_name'])['count'].agg({'sum'}).reset_index()
agg = agg.sort_values(by='sum', ascending=False).reset_index(drop=True)
agg.head()

# Plot the top 20 tags
(ggplot(data = agg.loc[0:20])
 + geom_bar(aes(x='tag_name', y='sum'), fill = '#49beb7', stat = "identity")
 + labs(title = "Top 20: Tag Count", x = 'Tag', y = 'Tag Count')
 + theme_light()
 + theme(
     axis_text_x = element_text(color='black', rotation=60),
     axis_text_y = element_text(color='black'),
     axis_line = element_line(color='black'),
     axis_ticks = element_line(color='grey'),
     figure_size = (10,6)))
The to_read tag is far more frequent than any other tag. Much like dropping terms that occur too often when using TF-IDF, one option is to remove the to_read tag and compare only the remaining tags.
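Applying that idea is a one-liner; the literal tag name 'to-read' is an assumption about this dataset:

# Drop the dominant 'to-read' tag before comparing the remaining tags
book_tags_filtered = book_tags[book_tags['tag_name'] != 'to-read'].reset_index(drop=True)
agg = book_tags_filtered.groupby(['tag_name'])['count'].agg({'sum'}).reset_index()
agg = agg.sort_values(by='sum', ascending=False).reset_index(drop=True)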
ย 
# Descriptive statistics of how often each tag occurs
pd.DataFrame(agg['sum'].describe()).T
The spread between books is very large: the minimum is tiny and the maximum is huge. Using this tag information, we can profile a user's taste from the tags of what they read and recommend accordingly, and we can also recommend books whose tags are similar, as sketched below.
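A rough sketch of the tag-similarity idea (scikit-learn is an extra dependency here, and it assumes the merged book_tags table keeps goodbooks-10k's goodreads_book_id column):

from sklearn.metrics.pairwise import cosine_similarity

# book x tag count matrix (dense; acceptable at this scale for a sketch)
book_tag_matrix = book_tags.pivot_table(index='goodreads_book_id', columns='tag_name',
                                        values='count', fill_value=0)

def similar_books_by_tags(gr_book_id, topn=10):
    # Rank the other books by cosine similarity of their tag profiles.
    idx = book_tag_matrix.index.get_loc(gr_book_id)
    sims = cosine_similarity(book_tag_matrix.values[idx:idx + 1], book_tag_matrix.values)[0]
    order = sims.argsort()[::-1]
    return [book_tag_matrix.index[i] for i in order if i != idx][:topn]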
ย 

ย 

Checking the ratings information

  • Descriptive statistics over all books and users
# Check the ratings data
# Plot: number of books read per user
agg = ratings.groupby(['user_id'])['book_id'].agg({'count'}).reset_index()
(ggplot(data = agg)
 + geom_histogram(aes(x='count'), binwidth = 5, fill = '#49beb7')
 + labs(title = 'Average Number of the Read Count', x = 'Read Count', y = 'User Count')
 + theme_light()
 + theme(
     axis_text_x = element_text(color='black'),
     axis_text_y = element_text(color='black'),
     axis_line=element_line(color="black"),
     axis_ticks=element_line(color = "grey"),
     figure_size=(10,6)))
pd.DataFrame(agg['count'].describe()).T
  • On average, a user reads 18 books
  • Everyone has bought and read at least 2 books
  • The heaviest reader has bought 200 books
# Plot: how many times each book was read
agg = ratings.groupby(['book_id'])['book_id'].agg({'count'}).reset_index()
(ggplot(data=agg)
 + geom_histogram(aes(x='count', y='stat(count)'), fill = '#49beb7', binwidth=5)
 + theme_minimal()
 + ggtitle("Average Read Count")
 + labs(x="Read Count", y="Book Count")
 + theme(
     axis_text_x = element_text(angle=60, color='black'),
     axis_text_y = element_text(color='black'),
     axis_line=element_line(color="black"),
     axis_ticks=element_line(color = "grey"),
     figure_size=(8,4)))
pd.DataFrame(agg['count'].describe()).T
  • ์ฑ…์˜ ๊ฒฝ์šฐ ์ตœ์†Œ 8๋ช…์€ ์ฝ๊ณ , ๋งŽ์ด ์ฝํžŒ ์ฑ…์˜ ๊ฒฝ์šฐ 100๋ช…์ด ์ฝ์Œ
  • ํŽธ์ฐจ๋Š” ํฌ์ง€๋งŒ, ํ‰์ ์ด ๋ถ€์—ฌ๋œ ์ฑ…๋“ค์˜ ๋Œ€๋ถ€๋ถ„์ด 100๋ช…์”ฉ์€ ์ฝ์€ ์ฑ…๋“ค๋งŒ ๋ฝ‘ํžŒ ๊ฒƒ์„ ๋ณผ ์ˆ˜ ์žˆ์Œ
agg.head()
books[books['book_id'].isin([1, 2, 3, 4, 5, 6, 7, 8])].head()
ratings['user_id'].unique()
ratings[(ratings['user_id'] == 314) & (ratings['book_id'].isin([1,2,3,4,5,6,7,8]))]
# Users who rated all 8 books of the series (book_ids 1-8 are the Harry Potter books)
agg = ratings[ratings['book_id'].isin([1,2,3,4,5,6,7,8])].groupby(['user_id'])['book_id'].agg({'nunique'})
agg = agg.reset_index()
agg = agg.groupby(['nunique'])['user_id'].agg({'count'}).reset_index()

(ggplot(data = agg)
 + geom_bar(aes(x='nunique', y='count'), fill = '#49beb7', stat = "identity")
 + labs(title = "Harry Potter's Reading Count", x = 'Series Count', y = 'Reading Person Count')
 + theme_light()
 + theme(
     axis_text_x = element_text(color='black'),
     axis_text_y = element_text(color='black'),
     axis_line=element_line(color="black"),
     axis_ticks=element_line(color = "grey"),
     figure_size=(10,6)))
agg['ratio'] = agg['count'] / agg['count'].sum()
agg[['nunique', 'ratio']].T
  • Series books such as Harry Potter tend to be read together

to_read information

to_read.head()
to_read['user_id'].nunique()
  • Recommendations can combine not only the books a user has already read but also the books they plan to read

Creating the train and test sets

agg = ratings.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
agg.head()
agg['train'] = agg['unique'].apply(lambda x: np.random.choice(x, len(x)//2))
agg.head()
test = []
for i in tqdm(range(0, agg.shape[0])):
    test_rec = list(set(agg.loc[i, 'unique']).difference(set(agg.loc[i, 'train'])))
    test.append(test_rec)
agg['test'] = test
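One caveat about this split: np.random.choice samples with replacement by default, so a user's 'train' half can contain duplicate book_ids and end up covering fewer than half of their books. If a disjoint 50/50 split is intended, pass replace=False:

# Disjoint per-user half split (replace=False avoids duplicate draws)
agg['train'] = agg['unique'].apply(lambda x: np.random.choice(x, len(x) // 2, replace=False))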
# train dataset
train_map = agg[['user_id', 'train']]

# unnest the per-user book arrays into (user_id, book_id) rows
train_map_unnest = np.dstack(
    (
        np.repeat(train_map.user_id.values, list(map(len, train_map.train))),
        np.concatenate(train_map.train.values)
    )
)
train_map_unnest = pd.DataFrame(data = train_map_unnest[0], columns = train_map.columns)
train_map_unnest.head()
# test dataset
test_map = agg[['user_id', 'test']]

# unnest the per-user book arrays into (user_id, book_id) rows
test_map_unnest = np.dstack(
    (
        np.repeat(test_map.user_id.values, list(map(len, test_map.test))),
        np.concatenate(test_map.test.values)
    )
)
test_map_unnest = pd.DataFrame(data = test_map_unnest[0], columns = test_map.columns)
test_map_unnest.head()
train_map_unnest.columns = ['user_id', 'book_id']
test_map_unnest.columns = ['user_id', 'book_id']
train_map_unnest.to_csv("train.csv", index=False)
test_map_unnest.to_csv("test.csv", index=False)
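The np.repeat/np.dstack unnesting predates pandas' built-in explode; on pandas 0.25 or later the same frames can be produced directly:

# Equivalent unnesting with DataFrame.explode (pandas >= 0.25)
train_map_unnest = train_map.explode('train').rename(columns={'train': 'book_id'})
test_map_unnest = test_map.explode('test').rename(columns={'test': 'book_id'})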
  • 53,424 users bought books in total, and there are 10,000 books
  • Of those, 48,871 users added books to their to-read cart
  • On average, each author has written more than 2 books
ย 
ย 

02. goodbooks-10k Baseline Model

import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm
path = '../input/t-academy-recommendation2/books/'
  • books.csv : book metadata
  • book_tags.csv : book-to-tag mapping information
  • ratings.csv : ratings that users gave to books
  • tags.csv : tag information
  • to_read.csv : books that users marked as to-read (the cart)
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()):
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])
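The loop above re-filters the whole DataFrame once per user; an equivalent construction that avoids the quadratic scan is a single dict comprehension over the grouped values:

# Equivalent ground-truth dict without the per-user filtering
gt = {user: list(items) for user, items in zip(sol['user_id'], sol['unique'])}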

The scenario: recommending 200 books to each user

rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()

Baseline

  • A statistics-based (popularity) model
books.sort_values(by='books_count', ascending=False)[0:5]
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    for rec in popular_rec_model[0:200]:
        rec_list.append(rec)
    total_rec_list[user] = rec_list
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py
class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt
        self.topn = topn

    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue
            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)
            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q

    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue
            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n

    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent

    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()

Extending the Baseline

  • Recommend the highest-rated books first
  • Recommend books by the user's favorite authors first
  • Recommend the books and authors sitting in the to-read cart first
  • When a book the user read has sequels in its series, recommend those
  • Recommend the newest books
train = pd.merge(train, books[['book_id', 'authors', 'ratings_count']], how='left', on='book_id')
agg = train.groupby(['user_id','authors'])['authors'].agg({'count'}).reset_index()
agg = agg.sort_values(by='count', ascending=False)
agg.head()
author_books = books[['book_id', 'authors', 'ratings_count']].sort_values(by=['authors', 'ratings_count'], ascending=[True, False])
author_books = author_books.reset_index(drop=True)
author_books.head()
author_rec_model = agg.merge(author_books, how='left', on=['authors'])
author_rec_model.head()
author_rec_model[author_rec_model['user_id'] == 30944]['book_id'].values
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    author_rec_model_ = author_rec_model[author_rec_model['user_id'] == user]['book_id'].values
    for rec in author_rec_model_:
        rec_list.append(rec)
    # pad with the popularity model when the author model yields fewer than 200 books
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            rec_list.append(i)
    total_rec_list[user] = rec_list[0:200]
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()

Post-processing

  • ์ฝ์€ ์ฑ…์€ ์ถ”์ฒœํ•ด์ฃผ๋ฉด ์•ˆ๋จ
  • ์ฝ์€ ์–ธ์–ด์™€ ๋งž๋Š” ์ฑ…์„ ์ถ”์ฒœํ•ด์ค˜์•ผํ•จ
# Extract the list of books each user has read
read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list.head()
total_rec_list = {}
for user in tqdm(rec_df['user_id'].unique()):
    rec_list = []
    author_rec_model_ = author_rec_model[author_rec_model['user_id'] == user]['book_id'].values
    seen = read_list[read_list['user_id'] == user]['unique'].values[0]
    for rec in author_rec_model_:
        if rec not in seen:
            rec_list.append(rec)
    # pad with unseen popular books when fewer than 200 survive
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            if i not in seen:
                rec_list.append(i)
    total_rec_list[user] = rec_list[0:200]
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200) evaluate_func._evaluate()
# Extract the list of languages each user can read
## With user metadata this would be trivial, but there is none, so build it directly:
## collect the languages of all the books a user rated and treat those as their readable languages
language = pd.merge(train, books[['book_id', 'language_code']], how='left', on='book_id')
language_list = language.groupby(['user_id'])['language_code'].agg({'unique'}).reset_index()
language_list.head()
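The notes build language_list but never apply it; a minimal sketch of the intended filter follows (treating books with a missing language_code as acceptable is an assumption, since the metadata is incomplete):

# Sketch: keep only recommendations whose language the user has read before
book_lang = books.set_index('book_id')['language_code'].to_dict()
user_langs = {u: set(l) for u, l in zip(language_list['user_id'], language_list['unique'])}

def filter_by_language(user, rec_list):
    langs = user_langs.get(user, set())
    # also keep books with an unknown language code, since the metadata is incomplete
    return [b for b in rec_list if pd.isnull(book_lang.get(b)) or book_lang.get(b) in langs]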
ย 
ย 
ย 

03. goodbooks-10k Collaborative Filtering Model

Here we write the code that makes recommendations to users with a collaborative filtering model.
# Load the required libraries
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm
# Set the path
path = '../input/t-academy-recommendation2/books/'
# Load the csv files from the path
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")
# Cast book_id to string
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)
์ด 23๊ฐœ์˜ column์œผ๋กœ ๊ตฌ์„ฑ๋˜์–ด ์žˆ์ง€๋งŒ ๊ฐ€๋…์„ฑ์„ ์œ„ํ•ด ๋’ท๋ถ€๋ถ„์„ ์ž˜๋ผ์„œ ๊ฐ€์ ธ์™”์Šต๋‹ˆ๋‹ค. ์—ฌ๊ธฐ์„œ ์ถ”์ฒœ์„ ์œ„ํ•ด books_count๋กœ ์ •๋ ฌ๋œ book_id๋ฅผ ์ถ”์ถœํ•˜์˜€์Šต๋‹ˆ๋‹ค.
ย 
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]
ย 
To make suitable recommendations, we group together all the book_ids each user_id has read.
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()):
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])
ย 
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()
From here on, the code implements Matrix Factorization. It consists of the fit, cost, gradient, gradient_descent, get_prediction, and get_complete_matrix functions; the update rules they implement are summarized after the list below.
ย 
fit: loops over the epochs, running gradient descent and computing the cost at each epoch
cost: computes the root-mean-squared error between predictions and the observed ratings
gradient: computes the gradients of the latent features
gradient_descent: updates the matrices using the gradients and the learning_rate
get_prediction: returns the rating predicted from the factorized matrices
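Concretely, the class predicts a rating as the sum of a global bias, a user bias, an item bias, and a latent dot product, and runs SGD on the regularized squared error. With $\eta$ = learning_rate and $\lambda$ = reg_param:

$$\hat{r}_{ij} = b + b^{P}_{i} + b^{Q}_{j} + p_i \cdot q_j, \qquad e_{ij} = r_{ij} - \hat{r}_{ij}$$

$$p_i \leftarrow p_i + \eta\,(e_{ij}\,q_j - \lambda\,p_i), \qquad q_j \leftarrow q_j + \eta\,(e_{ij}\,p_i - \lambda\,q_j)$$

The biases $b^{P}_{i}$ and $b^{Q}_{j}$ are updated the same way, with gradient $e_{ij} - \lambda\,b$.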
ย 
import numpy as np
from tqdm import tqdm_notebook as tqdm

# Base code : https://yamalab.tistory.com/92
class MatrixFactorization():
    def __init__(self, R, k, learning_rate, reg_param, epochs, verbose=False):
        """
        :param R: rating matrix
        :param k: latent parameter
        :param learning_rate: alpha on weight update
        :param reg_param: beta on weight update
        :param epochs: training epochs
        :param verbose: print status
        """
        self._R = R
        self._num_users, self._num_items = R.shape
        self._k = k
        self._learning_rate = learning_rate
        self._reg_param = reg_param
        self._epochs = epochs
        self._verbose = verbose

    def fit(self):
        """
        training Matrix Factorization : update the latent weight matrices and biases

        Note on self._b:
        - global bias: the mean of the ratings observed in the input R is used as the global bias
        - this acts as normalization: instead of negative values appearing in the final
          rating, negatives are absorbed into the latent features

        :return: training_process
        """
        # init latent features
        self._P = np.random.normal(size=(self._num_users, self._k))
        self._Q = np.random.normal(size=(self._num_items, self._k))

        # init biases
        self._b_P = np.zeros(self._num_users)
        self._b_Q = np.zeros(self._num_items)
        self._b = np.mean(self._R[np.where(self._R != 0)])

        # train for the given number of epochs
        self._training_process = []
        for epoch in range(self._epochs):
            # train only on the indices where a rating exists
            xi, yi = self._R.nonzero()
            for i, j in zip(xi, yi):
                self.gradient_descent(i, j, self._R[i, j])
            cost = self.cost()
            self._training_process.append((epoch, cost))

            # print status
            if self._verbose == True and ((epoch + 1) % 10 == 0):
                print("Iteration: %d ; cost = %.4f" % (epoch + 1, cost))

    def cost(self):
        """
        compute root mean square error
        :return: rmse cost
        """
        # xi, yi: the indices where R[xi, yi] is nonzero
        # reference: http://codepractice.tistory.com/90
        xi, yi = self._R.nonzero()
        # predicted = self.get_complete_matrix()
        cost = 0
        for x, y in zip(xi, yi):
            cost += pow(self._R[x, y] - self.get_prediction(x, y), 2)
        return np.sqrt(cost/len(xi))

    def gradient(self, error, i, j):
        """
        gradient of latent feature for GD
        :param error: rating - prediction error
        :param i: user index
        :param j: item index
        :return: gradient of latent feature tuple
        """
        dp = (error * self._Q[j, :]) - (self._reg_param * self._P[i, :])
        dq = (error * self._P[i, :]) - (self._reg_param * self._Q[j, :])
        return dp, dq

    def gradient_descent(self, i, j, rating):
        """
        gradient descent function
        :param i: user index of matrix
        :param j: item index of matrix
        :param rating: rating of (i,j)
        """
        # get error
        prediction = self.get_prediction(i, j)
        error = rating - prediction

        # update biases
        self._b_P[i] += self._learning_rate * (error - self._reg_param * self._b_P[i])
        self._b_Q[j] += self._learning_rate * (error - self._reg_param * self._b_Q[j])

        # update latent features
        dp, dq = self.gradient(error, i, j)
        self._P[i, :] += self._learning_rate * dp
        self._Q[j, :] += self._learning_rate * dq

    def get_prediction(self, i, j):
        """
        get predicted rating: user_i, item_j
        :return: prediction of r_ij
        """
        return self._b + self._b_P[i] + self._b_Q[j] + self._P[i, :].dot(self._Q[j, :].T)

    def get_complete_matrix(self):
        """
        compute the complete matrix PXQ + P.bias + Q.bias + global bias

        - adding b_P[:, np.newaxis] to the PXQ matrix adds the user bias to every column
        - adding b_Q[np.newaxis, :] adds the item bias to every row
        - adding b adds the global bias to every element
        - newaxis adds a dimension so the 1-D latent bias vectors can broadcast
          row- and column-wise against the 2-D matrix R

        :return: complete matrix R^
        """
        return self._b + self._b_P[:, np.newaxis] + self._b_Q[np.newaxis, :] + self._P.dot(self._Q.T)
ย 
์ •์˜ํ•œ MatrixFactorization์„ ์‚ฌ์šฉํ•˜๋Š” ๊ณผ์ •์ž…๋‹ˆ๋‹ค.
%%time
factorizer = MatrixFactorization(R, k=20, learning_rate=0.01, reg_param=0.01, epochs=100, verbose=True)
factorizer.fit()
sgd_rec_model = factorizer.get_complete_matrix()
# Extract the list of books each user has read
read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list.head()
total_rec_list = {}
for user in tqdm(data['useridx'].unique()):
    rec_list = []
    # map the matrix indices back to the original book ids, excluding itself
    rating_scores = [(idx2book[i], c) for i, c in enumerate(sgd_rec_model[user]) if i != user]
    rating_scores = sorted(rating_scores, key = lambda x: x[1], reverse=True)  # sort by predicted rating

    seen = read_list[read_list['user_id'] == idx2user[user]]['unique'].values[0]
    for rec in rating_scores[0:250]:
        if rec[0] not in seen:
            rec_list.append(rec[0])

    # pad with unseen popular books when fewer than 200 survive
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            if i not in seen:
                rec_list.append(i)
    total_rec_list[idx2user[user]] = rec_list[0:200]
ย 
The three evaluation metrics, MAP, NDCG, and entropy diversity, are computed with the same evaluate class defined in section 02 above.
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()
ย 
ย 
The same recommendation was also implemented with ALS.
from implicit.evaluation import *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR

als_model = ALS(factors=20, regularization=0.01, iterations = 100)
als_model.fit(purchase_sparse.T)  # implicit < 0.5 expects an item x user matrix
als_model.recommend(0, purchase_sparse, N=200)[0:10]
total_rec_list = {}
for user in tqdm(data['useridx'].unique()):
    rec_list = []
    seen = read_list[read_list['user_id'] == idx2user[user]]['unique'].values[0]
    recs = als_model.recommend(user, purchase_sparse, N=250)  # implicit < 0.5 returns (item, score) pairs
    recs = [idx2book[x[0]] for x in recs][0:250]  # map matrix indices back to the original book ids
    for rec in recs:
        if rec not in seen:
            rec_list.append(rec)
    # pad with unseen popular books when fewer than 200 survive
    if len(rec_list) < 200:
        for i in popular_rec_model[0:200]:
            if i not in seen:
                rec_list.append(i)
    total_rec_list[idx2user[user]] = rec_list[0:200]
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()

04. goodbooks-10k Contents Based Model