2014年10月29日 星期三

[Linux] 使用 MongoDB Aggregate (Map-Reduce) 實作共現矩陣(Co-Occurrence Matrix)及簡易的推薦系統 @ Ubuntu 14.04


這陣子查閱了一些推薦系統的作法,發現早在 2012 年網路上就有非常多 Apache Mahout 的使用心得,這一套已經內含許多常見的演算法精華,故許多人都是採用 Apache Mahout 搭建推薦系統的。至於為何還要自己刻一個出來?單純練功啦,在此就練練 Partition、Sort 和 Merge (Map-Reduce) 的分析方式,但採用 MongoDB Aggregate 實作。

此例挑選 user, item 的關聯紀錄,例如 user1 買了 item1, item2, item5,而 user2 買了 item3, item4 等。在推薦系統中,大多就是將 input 前置處理,在挑恰當的演算法加以計算,接著處理 output。在此採用共現矩陣(Co-Occurrence Matrix) 的用法,以 user, item 的關聯紀錄,可以統計出一個 Item-based Co-Occurrence Matrix,簡言之就是各個 item 之間的關係,例如透過 item 1 跟 item 2 一起出現現象,整理成一個計算方式。

假設共有 a, b, c 三個使用者,而有 1, 2, 3, 4, 5,共有五種物品,其中 a, b, c 分別購買部分用品,並對各物品的評價(或價錢等意義)如下:

a, 1, 0.3
a, 3, 0.5
a, 4, 0.9

b, 2, 0.1
b, 3, 0.6
b, 5, 0.2

c, 3, 0.2
c, 4, 0.7


其中, 共現矩陣(Co-Occurrence Matrix) 如下:

\  1, 2, 3, 4, 5
1: 1, 0, 1, 1, 0
2: 0, 1, 1, 0, 1
3: 1, 1, 3, 2, 1
4: 1, 0, 2, 2, 0
5: 0, 1, 1, 0, 1


(1, 1) = 1, 代表 1 僅出現在 1 個使用者的清單
(4, 4) = 2, 代表 4 出現在 2 個使用者的清單
(3, 3) = 3, 代表 3 出現在 3 個使用者的清單
(4, 3) = 2, 代表 4 跟 3 同時出現的次數有兩次

接著,把使用者的購物清單也向量化(或矩陣):

\    a,   b,   c
1: 0.3,   0,   0,
2:   0, 0.1,   0,
3: 0.5, 0.6, 0.2,
4: 0.9,   0, 0.7,
5:   0, 0.2,   0,


當這兩個矩陣相成的結果,即為使用者與物品的關聯成果,且每個物品的分數即可當作使用者感興趣的分數。


以 MongoDB 為例,先將資料一筆筆新增至 db.rec 中:

db.rec.drop()
db.rec.insert({user:"a", item: "1", rate: 0.3})
db.rec.insert({user:"a", item: "3", rate: 0.5})
db.rec.insert({user:"a", item: "4", rate: 0.9})
db.rec.insert({user:"b", item: "2", rate: 0.1})
db.rec.insert({user:"b", item: "3", rate: 0.6})
db.rec.insert({user:"b", item: "5", rate: 0.2})
db.rec.insert({user:"c", item: "3", rate: 0.2})
db.rec.insert({user:"c", item: "4", rate: 0.7})

> db.rec.find()
{ "_id" : ObjectId("5450dedfbdfb5bad287c953c"), "user" : "a", "item" : "1", "rate" : 0.3 }
{ "_id" : ObjectId("5450dedfbdfb5bad287c953d"), "user" : "a", "item" : "3", "rate" : 0.5 }
{ "_id" : ObjectId("5450dedfbdfb5bad287c953e"), "user" : "a", "item" : "4", "rate" : 0.9 }
{ "_id" : ObjectId("5450dedfbdfb5bad287c953f"), "user" : "b", "item" : "2", "rate" : 0.1 }
{ "_id" : ObjectId("5450dedfbdfb5bad287c9540"), "user" : "b", "item" : "3", "rate" : 0.6 }
{ "_id" : ObjectId("5450dedfbdfb5bad287c9541"), "user" : "b", "item" : "5", "rate" : 0.2 }
{ "_id" : ObjectId("5450dedfbdfb5bad287c9542"), "user" : "c", "item" : "3", "rate" : 0.2 }
{ "_id" : ObjectId("5450dee0bdfb5bad287c9543"), "user" : "c", "item" : "4", "rate" : 0.7 }


接著分析 user 跟 item 的個數,分別儲存在 db.user 跟 db.item 中:

> db.item.drop()
true
> db.rec.aggregate([{$group: {_id: "$item"}}, {$sort: {_id: 1}}]).forEach(function(input) {
db.item.insert(input);
})
> db.item.find()
{ "_id" : "1" }
{ "_id" : "2" }
{ "_id" : "3" }
{ "_id" : "4" }
{ "_id" : "5" }

> db.user.drop()
> db.rec.aggregate([{$group: {_id: "$user"}}, {$sort: {_id: 1}}]).forEach(function(input) {
db.user.insert(input);
})
> db.user.find()
{ "_id" : "a" }
{ "_id" : "b" }
{ "_id" : "c" }


建立 Item-based Co-Occurrence Matrix(在此採用sparse matrix):

> db.user_item_list.drop();
> db.rec.aggregate({ $group: {_id: "$user", items: {$push: "$item"} }}).forEach(function(input){
db.user_item_list.insert(input);
})

> db.comatrix_input.drop()
> db.user_item_list.find().forEach(function(input){
for (var i in input.items){
var key = input.items[i]+"-"+input.items[i];
db.comatrix_input.insert( { key: key, value: 1} );
for (var j in input.items){
if (j>i) {
var key = input.items[i]+"-"+input.items[j];
db.comatrix_input.insert( { key: key, value: 1} );
key = input.items[j]+"-"+input.items[i];
db.comatrix_input.insert( { key: key, value: 1} );
}
}
}
});
> db.comatrix.drop()
> db.comatrix_input.aggregate({ $group: {_id: "$key", value: {$sum: "$value"}}}).forEach(function(input) {
db.comatrix.insert(input)
});
> db.comatrix.find()
{ "_id" : "1-3", "value" : 1 }
{ "_id" : "3-5", "value" : 1 }
{ "_id" : "5-2", "value" : 1 }
{ "_id" : "1-4", "value" : 1 }
{ "_id" : "3-1", "value" : 1 }
{ "_id" : "3-2", "value" : 1 }
{ "_id" : "2-5", "value" : 1 }
{ "_id" : "2-3", "value" : 1 }
{ "_id" : "1-1", "value" : 1 }
{ "_id" : "3-3", "value" : 3 }
{ "_id" : "5-3", "value" : 1 }
{ "_id" : "4-1", "value" : 1 }
{ "_id" : "4-3", "value" : 2 }
{ "_id" : "3-4", "value" : 2 }
{ "_id" : "5-5", "value" : 1 }
{ "_id" : "4-4", "value" : 2 }
{ "_id" : "2-2", "value" : 1 }


透過共現矩陣推算 user 對 item 的喜好程度:

> db.user_prefer.drop()
> db.item.find().forEach(function(in1) {
var value = 0;
db.item.find().forEach(function(in2) {
var key = in1._id+"-"+in2._id;
var factorInfo = db.comatrix.findOne( {_id : key} )
if (factorInfo && factorInfo.value) {
db.user.find().forEach(function(in3){
var user_id = in3._id
var get = db.rec.findOne({user: in3._id, item: in2._id})
if (get)
db.user_prefer.insert( {user: in3._id, item: in1._id, value: factorInfo.value * get.rate} )
})
}
})
})

> db.recommendation.drop()
> db.user_prefer.aggregate([{$group: {_id: { user: "$user", item:"$item"}, value: {$sum: "$value"}}}, {$sort: {_id: 1}}]).forEach(function(input){
db.recommendation.insert({_id: input._id, user: input._id.user, item: input._id.item, value: input.value})
})
> db.recommendation.find()
{ "_id" : { "user" : "a", "item" : "1" }, "user" : "a", "item" : "1", "value" : 1.7000000000000002 }
{ "_id" : { "user" : "a", "item" : "2" }, "user" : "a", "item" : "2", "value" : 0.5 }
{ "_id" : { "user" : "a", "item" : "3" }, "user" : "a", "item" : "3", "value" : 3.6 }
{ "_id" : { "user" : "a", "item" : "4" }, "user" : "a", "item" : "4", "value" : 3.1 }
{ "_id" : { "user" : "a", "item" : "5" }, "user" : "a", "item" : "5", "value" : 0.5 }
{ "_id" : { "user" : "b", "item" : "1" }, "user" : "b", "item" : "1", "value" : 0.6 }
{ "_id" : { "user" : "b", "item" : "2" }, "user" : "b", "item" : "2", "value" : 0.8999999999999999 }
{ "_id" : { "user" : "b", "item" : "3" }, "user" : "b", "item" : "3", "value" : 2.1 }
{ "_id" : { "user" : "b", "item" : "4" }, "user" : "b", "item" : "4", "value" : 1.2 }
{ "_id" : { "user" : "b", "item" : "5" }, "user" : "b", "item" : "5", "value" : 0.8999999999999999 }
{ "_id" : { "user" : "c", "item" : "1" }, "user" : "c", "item" : "1", "value" : 0.8999999999999999 }
{ "_id" : { "user" : "c", "item" : "2" }, "user" : "c", "item" : "2", "value" : 0.2 }
{ "_id" : { "user" : "c", "item" : "3" }, "user" : "c", "item" : "3", "value" : 2 }
{ "_id" : { "user" : "c", "item" : "4" }, "user" : "c", "item" : "4", "value" : 1.7999999999999998 }
{ "_id" : { "user" : "c", "item" : "5" }, "user" : "c", "item" : "5", "value" : 0.2 }


清掉 user 已擁有 items:

> db.rec.find().forEach(function(input){
var key = { "user": input.user, "item": input.item }
db.recommendation.remove( { _id : { $eq: key } } )
})

> db.recommendation.find()
{ "_id" : { "user" : "a", "item" : "2" }, "user" : "a", "item" : "2", "value" : 0.5 }
{ "_id" : { "user" : "a", "item" : "5" }, "user" : "a", "item" : "5", "value" : 0.5 }
{ "_id" : { "user" : "b", "item" : "1" }, "user" : "b", "item" : "1", "value" : 0.6 }
{ "_id" : { "user" : "b", "item" : "4" }, "user" : "b", "item" : "4", "value" : 1.2 }
{ "_id" : { "user" : "c", "item" : "1" }, "user" : "c", "item" : "1", "value" : 0.8999999999999999 }
{ "_id" : { "user" : "c", "item" : "2" }, "user" : "c", "item" : "2", "value" : 0.2 }
{ "_id" : { "user" : "c", "item" : "5" }, "user" : "c", "item" : "5", "value" : 0.2 }


收工!

沒有留言:

張貼留言