추천시스템의 기본 및 컨텐츠 기반 필터링 실습


잠재요인 협업 필터링(행렬 분해) 이해하기

import numpy as np

# Build the original rating matrix R (rows are users, columns are items).
# Missing ratings are marked with np.nan; the np.NaN alias is avoided
# because it was removed in NumPy 2.0.
R = np.array([[4, np.nan, np.nan, 2, np.nan],
              [np.nan, 5, np.nan, 3, 1],
              [np.nan, np.nan, 3, 4, 4],
              [5, 2, 1, 2, np.nan]])
num_users, num_items = R.shape
K = 3  # number of latent factors (columns of P and Q)

# Size P as (num_users, K) and Q as (num_items, K) and fill both with
# normally distributed random values; the fixed seed makes the run repeatable.
np.random.seed(1)
P = np.random.normal(scale=1./K, size=(num_users, K))
Q = np.random.normal(scale=1./K, size=(num_items, K))
P
array([[ 0.54144845, -0.2039188 , -0.17605725],
       [-0.35765621,  0.28846921, -0.76717957],
       [ 0.58160392, -0.25373563,  0.10634637],
       [-0.08312346,  0.48736931, -0.68671357]])
Q
array([[-0.1074724 , -0.12801812,  0.37792315],
       [-0.36663042, -0.05747607, -0.29261947],
       [ 0.01407125,  0.19427174, -0.36687306],
       [ 0.38157457,  0.30053024,  0.16749811],
       [ 0.30028532, -0.22790929, -0.04096341]])
num_users
4
np.dot(P,Q.T)
array([[-9.86215743e-02, -1.35273244e-01,  3.25938576e-02,
         1.15829937e-01,  2.16275915e-01],
       [-2.88426030e-01,  3.39039250e-01,  3.32466259e-01,
        -1.78279922e-01, -1.41717429e-01],
       [ 1.01671414e-02, -2.29768982e-01, -8.01253853e-02,
         1.63482851e-01,  2.28119515e-01],
       [-3.12983578e-01,  2.03409279e-01,  3.45449141e-01,
        -2.71808518e-04, -1.07906618e-01]])
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between observed ratings and the factorized prediction.

    Args:
        R: 2-D rating matrix; only the cells listed in ``non_zeros`` are read.
        P: factor matrix with one row per row of R.
        Q: factor matrix with one row per column of R.
        non_zeros: sequence of tuples whose first two elements are the row
            and column index of an observed rating (any further elements
            are ignored here).

    Returns:
        Root mean squared error over the listed cells.
    """
    # Predicted rating matrix from the two factor matrices.
    full_pred_matrix = np.dot(P, Q.T)

    # Row/column positions of the observed ratings.
    rows = [non_zero[0] for non_zero in non_zeros]
    cols = [non_zero[1] for non_zero in non_zeros]

    # RMSE is computed directly with NumPy, so this helper has no
    # scikit-learn dependency.
    residuals = R[rows, cols] - full_pred_matrix[rows, cols]
    return np.sqrt(np.mean(residuals ** 2))
# Collect (row, col, rating) for every observed rating. NaN cells fail the
# "> 0" comparison and are therefore left out.
non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]

steps = 1000
learning_rate = 0.01
r_lambda = 0.01  # weight of the regularization term in the update

# Repeatedly update P and Q with stochastic gradient descent.
for step in range(steps):
    for i, j, r in non_zeros:
        # Error between the actual rating and the current prediction.
        eij = r - np.dot(P[i, :], Q[j, :].T)
        # Regularized SGD update. Note that the Q update reads the P row
        # that was just updated on the previous line.
        P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
        Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])

    # RMSE is only reported every 50 steps, so it is only computed then
    # instead of on every one of the `steps` iterations.
    if (step % 50) == 0 :
        rmse = get_rmse(R, P, Q, non_zeros)
        print("### iteration step : ", step," rmse : ", rmse)
### iteration step :  0  rmse :  3.2388050277987723
### iteration step :  50  rmse :  0.48767231013696477
### iteration step :  100  rmse :  0.1564340384819248
### iteration step :  150  rmse :  0.07455141311978032
### iteration step :  200  rmse :  0.0432522679857931
### iteration step :  250  rmse :  0.029248328780879088
### iteration step :  300  rmse :  0.022621116143829344
### iteration step :  350  rmse :  0.01949363619652533
### iteration step :  400  rmse :  0.018022719092132503
### iteration step :  450  rmse :  0.01731968595344277
### iteration step :  500  rmse :  0.01697365788757103
### iteration step :  550  rmse :  0.016796804595895533
### iteration step :  600  rmse :  0.01670132290188455
### iteration step :  650  rmse :  0.016644736912476654
### iteration step :  700  rmse :  0.016605910068210192
### iteration step :  750  rmse :  0.01657420047570466
### iteration step :  800  rmse :  0.016544315829215932
### iteration step :  850  rmse :  0.01651375177473506
### iteration step :  900  rmse :  0.016481465738195183
### iteration step :  950  rmse :  0.016447171683479173
# Rebuild the full rating matrix from the learned factors and display it.
pred_matrix = P @ Q.T
print(pred_matrix)
[[3.99062329 0.89653623 1.30649077 2.00210666 1.66340846]
 [6.69571106 4.97792757 0.97850229 2.98066034 1.0028451 ]
 [6.67689303 0.39076095 2.98728588 3.9769208  3.98610743]
 [4.96790858 2.00517956 1.00634763 2.01691675 1.14044567]]

TMDB 데이터 - 컨텐츠 기반 필터링

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load the TMDB movies CSV, report its dimensions and preview three rows.
movies = pd.read_csv(filepath_or_buffer="./data/tmdb/tmdb_5000_movies.csv")
print(movies.shape)
movies.head(n=3)
(4803, 20)
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companiesproduction_countriesrelease_daterevenueruntimespoken_languagesstatustaglinetitlevote_averagevote_count
0237000000[{"id": 28, "name": "Action"}, {"id": 12, "nam...http://www.avatarmovie.com/19995[{"id": 1463, "name": "culture clash"}, {"id":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{"name": "Ingenious Film Partners", "id": 289...[{"iso_3166_1": "US", "name": "United States o...2009-12-102787965087162.0[{"iso_639_1": "en", "name": "English"}, {"iso...ReleasedEnter the World of Pandora.Avatar7.211800
1300000000[{"id": 12, "name": "Adventure"}, {"id": 14, "...http://disney.go.com/disneypictures/pirates/285[{"id": 270, "name": "ocean"}, {"id": 726, "na...enPirates of the Caribbean: At World's EndCaptain Barbossa, long believed to be dead, ha...139.082615[{"name": "Walt Disney Pictures", "id": 2}, {"...[{"iso_3166_1": "US", "name": "United States o...2007-05-19961000000169.0[{"iso_639_1": "en", "name": "English"}]ReleasedAt the end of the world, the adventure begins.Pirates of the Caribbean: At World's End6.94500
2245000000[{"id": 28, "name": "Action"}, {"id": 12, "nam...http://www.sonypictures.com/movies/spectre/206647[{"id": 470, "name": "spy"}, {"id": 818, "name...enSpectreA cryptic message from Bond’s past sends him o...107.376788[{"name": "Columbia Pictures", "id": 5}, {"nam...[{"iso_3166_1": "GB", "name": "United Kingdom"...2015-10-26880674609148.0[{"iso_639_1": "fr", "name": "Fran\u00e7ais"},...ReleasedA Plan No One EscapesSpectre6.34466
# Keep only the columns used for content-based filtering. The explicit
# .copy() makes movies_df an independent frame, so the column assignments
# made to it later do not raise pandas' SettingWithCopyWarning.
movies_df = movies[['id','title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']].copy()
# Widen the displayed column width (full option name used to avoid ambiguity).
pd.set_option("display.max_colwidth", 100)
movies_df[["genres", "keywords"]][:1]
genreskeywords
0[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {...[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "sp...
type(movies_df["genres"][0])
str
  • 현재 genres 컬럼에는 리스트 안에 딕셔너리가 들어 있는 형태의 데이터가 저장되어 있음
  • 하지만 데이터 타입을 확인해 보면 문자열(str)임
  • ast 모듈의 literal_eval 함수를 사용하면 이 문자열을 파이썬 객체로 변환할 수 있음
  • 즉 str을 읽어서 원래 구조인 리스트와 딕셔너리 형태로 바꿔줌
from ast import literal_eval # 리스트 내의 딕셔너리 형태의 스트링 값을 객체로 바꿔주기 위해 사용
# Parse each genres string into the Python objects it spells out.
movies_df["genres"] = movies_df["genres"].map(literal_eval)
movies_df["genres"]
0       [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {...
1            [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 28, 'name': 'Action'}]
2              [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 80, 'name': 'Crime'}]
3       [{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}, {'id': ...
4       [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 878, 'name': 'Science Fic...
                                                       ...                                                 
4798            [{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 53, 'name': 'Thriller'}]
4799                                       [{'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]
4800    [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}, {'...
4801                                                                                                     []
4802                                                                    [{'id': 99, 'name': 'Documentary'}]
Name: genres, Length: 4803, dtype: object
movies_df["genres"][0][0]["name"]
'Action'
  • 데이터가 잘 읽힘
# Parse the keywords strings the same way as genres.
movies_df["keywords"] = movies_df["keywords"].map(literal_eval)
# Keep only the value stored under the "name" key of each dict.
movies_df["genres"] = movies_df["genres"].map(
    lambda entries: [entry["name"] for entry in entries]
)
movies_df["keywords"] = movies_df["keywords"].map(
    lambda entries: [entry["name"] for entry in entries]
)
movies_df[["genres", "keywords"]][:1]
genreskeywords
0[Action, Adventure, Fantasy, Science Fiction][culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa...
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer tokenizes text, so join each genre list into one
# space-separated string first.
movies_df["genres_literal"] = movies_df["genres"].apply(lambda x : (' ').join(x))
# min_df is passed as the float 0.0 (a document-frequency proportion).
# Newer scikit-learn releases validate constructor parameters and reject the
# integer 0; the float form keeps the same "no minimum" threshold.
count_vect = CountVectorizer(min_df=0.0, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df["genres_literal"])
print(genre_mat.shape)
(4803, 276)
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of the genre count matrix against itself; the printed
# shape shows one row and one column per movie.
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
# Show the similarity row of the first movie.
print(genre_sim[:1])
(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]]
# argsort() orders the indices of each row by ascending similarity; the
# [:, ::-1] slice then reverses every row so the most similar come first.
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1] # [::-1] turns the order into descending
# Show the sorted indices for the first movie.
print(genre_sim_sorted_ind[:1])
[[   0 3494  813 ... 3038 3037 2401]]
  • 0번 레코드(자기 자신)를 제외하면 3494번 레코드가 가장 높은 유사도를 가짐
  • 가장 낮은 유사도를 가진 것은 2401번 레코드라는 의미로 해석함
# 장르 유사도에 따라 영화를 추천하는 함수 생성
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the rows of ``df`` ranked first in ``sorted_ind`` for a title.

    Args:
        df: DataFrame with a "title" column. Its index values are used as
            positions, both into ``sorted_ind`` and via ``df.iloc``.
        sorted_ind: 2-D array whose row ``i`` lists row positions of ``df``
            in ranking order for row ``i``.
        title_name: title to look up in ``df["title"]``.
        top_n: number of leading entries taken from each matching row.

    Returns:
        The ``df`` rows at the selected positions, in ranking order. The
        selected positions are also printed.
    """
    # Rows whose title matches, and their index values.
    matched_rows = df.loc[df["title"] == title_name]
    row_positions = matched_rows.index.values

    # First top_n ranked positions for each matched row.
    top_positions = sorted_ind[row_positions, :top_n]
    print(top_positions)

    return df.iloc[top_positions.reshape(-1)]
# Look up the 10 top-ranked movies by genre similarity for "The Godfather"
# and show their titles and average ratings.
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, "The Godfather", 10)
similar_movies[["title", "vote_average"]]
[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]
titlevote_average
2731The Godfather: Part II8.3
1243Mean Streets7.2
3636Light Sleeper5.7
1946The Bad Lieutenant: Port of Call - New Orleans6.0
2640Things to Do in Denver When You're Dead6.7
4065Mi America0.0
1847GoodFellas8.2
4217Kids6.8
883Catch Me If You Can7.7
3866City of God8.1
  • Mi America는 평점이 0점
  • 낯선 영화도 많음
  • 개선이 필요함
# Ten rows with the highest raw vote_average, shown with their vote counts.
movies_df[["title", "vote_average", "vote_count"]].sort_values(by="vote_average", ascending=False).head(10)
titlevote_averagevote_count
3519Stiff Upper Lips10.01
4247Me You and Five Bucks10.02
4045Dancer, Texas Pop. 8110.01
4662Little Big Top10.01
3992Sardaarji9.52
2386One Man's Hero9.32
2970There Goes My Baby8.52
1881The Shawshank Redemption8.58205
2796The Prisoner of Zenda8.411
3337The Godfather8.45893
  • vote_average순으로 내림차순 정렬해보면 vote_count가 1~2개인데 높은 점수를 준 경우가 있음
  • 이는 왜곡된 평점데이터라고 가정하고 가중치가 부여된 평점으로 개선
  • 가중평점 = (v/(v+m)) * R + (m/(v+m)) * C
    • v : 개별영화 평점을 투표한 횟수
    • m : 평점을 부여하기 위한 최소 투표횟수
    • R : 개별 영화에 대한 평균 평점
    • C : 전체 영화에 대한 평균 평점
# m: vote_count value at the 0.6 quantile.
# C: mean of vote_average over all movies.
m = movies_df["vote_count"].quantile(q=0.6)
C = movies_df["vote_average"].mean()
print("C:", round(C, 3), "m:", round(m, 3))
C: 6.092 m: 370.2
# Module-level C and m are the values read by weighted_vote_average.
percentile = 0.6
C = movies_df["vote_average"].mean()
m = movies_df["vote_count"].quantile(q=percentile)

def weighted_vote_average(record):
    """Return the weighted rating (v/(v+m))*R + (m/(v+m))*C for one record.

    ``record`` must provide "vote_count" (v) and "vote_average" (R).
    ``m`` and ``C`` are read from module-level globals.
    """
    votes = record["vote_count"]
    rating = record["vote_average"]
    total = votes + m

    own_share = (votes / total) * rating
    global_share = (m / total) * C
    return own_share + global_share

# Compute the weighted rating row by row. The function is applied to
# movies_df (the frame that receives the new column) rather than the raw
# `movies` frame, so the result does not depend on the two frames staying
# row-aligned.
movies_df["weighted_vote"] = movies_df.apply(weighted_vote_average, axis=1)
# Ten movies with the highest weighted rating.
movies_df[["title", "vote_average", "weighted_vote", "vote_count"]].sort_values("weighted_vote", ascending=False)[:10]
titlevote_averageweighted_votevote_count
1881The Shawshank Redemption8.58.3960528205
3337The Godfather8.48.2635915893
662Fight Club8.38.2164559413
3232Pulp Fiction8.38.2071028428
65The Dark Knight8.28.13693012002
1818Schindler's List8.38.1260694329
3865Whiplash8.38.1232484254
809Forrest Gump8.28.1059547927
2294Spirited Away8.38.1058673840
2731The Godfather: Part II8.38.0795863338
# 장르 유사도에 따라 영화를 추천하는 함수 생성
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Recommend up to ``top_n`` similar movies ordered by weighted rating.

    Args:
        df: DataFrame with "title" and "weighted_vote" columns. Its index
            values are used as positions (into ``sorted_ind`` and via
            ``df.iloc``), so a default 0..n-1 index is assumed.
        sorted_ind: 2-D array whose row ``i`` lists row positions of ``df``
            from most to least similar to row ``i``.
        title_name: title to look up in ``df["title"]``.
        top_n: maximum number of rows to return.

    Returns:
        Up to ``top_n`` rows of ``df``, taken from the ``top_n * 2`` most
        similar candidates of every row matching ``title_name`` and sorted
        by "weighted_vote" descending. The matching rows themselves are
        excluded. An unknown title yields an empty DataFrame.
    """
    # Index values of every row carrying the requested title.
    title_index = df[df["title"] == title_name].index.values

    # Take twice as many candidates as needed, then rank them by rating.
    candidates = sorted_ind[title_index, :(top_n * 2)].reshape(-1)

    # Exclude the reference movie(s). np.isin also works when several rows
    # share the title, where an elementwise `!=` against title_index would
    # compare arrays of different lengths.
    candidates = candidates[~np.isin(candidates, title_index)]

    # Several matching rows can contribute the same candidate; keep the first
    # occurrence only so no movie is recommended twice.
    candidates = list(dict.fromkeys(candidates.tolist()))

    return df.iloc[candidates].sort_values("weighted_vote", ascending=False)[:top_n]

# Look up recommendations for "The Godfather" again and show their titles
# and average ratings.
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, "The Godfather", 10)
similar_movies[["title", "vote_average"]]
titlevote_average
2731The Godfather: Part II8.3
1847GoodFellas8.2
3866City of God8.1
1663Once Upon a Time in America8.2
883Catch Me If You Can7.7
281American Gangster7.4
4041This Is England7.4
1149American Hustle6.8
1243Mean Streets7.2
2839Rounders6.9



© 2021.04. by Jessie

Powered by jessie