2010年10月21日 星期四

Javascript K-means Clustering

嘗試用 Javascript 寫了 K-means algorithm,在這過程之中,讓我回想起兩年前用 C 寫 Betweenness Centrality 的點滴!偶爾拿來訓練腦袋也不錯!


在這過程之中,發現一件事情,那就是 K-means 原先是想要把一堆資料分成 K 群為目標,那是否有可能收斂時,卻沒有 K 個群呢?這個我在自己的測資中,有發現這個現象,但我不曉得是不是演算法哪邊寫錯了,還是本來就存在這個問題?


k-means
黑色代表五筆資料,紅色代表一開始假設的族群中心點


例如原先共有 5 筆資料,以二維平面來說,假設資料全部都縮在角落,然後一開始決定 k=2 時,恰好設定在兩個對角,導致在判斷 5 筆資料要歸屬那個群族時,變成一個是空的,另一個有 5 筆資料,如此下去,很快就收斂,但資料就變成只有一個族群了。解決的方式就是一開始指定群族的位置時,直接指定在實際存在的點上,這樣可以確保每個族群一開始至少有一個點。


程式碼:


try{
        console.log( 'begin' );
}catch(err){
        console = {}
        console.log = function(){}
}

//
// pointer_list = []
// pointer_list[0] = { 'x':x , 'y':y , 'group_index':-1, 'distance':-1 }
//
var pointer_list = new Array();
for( var i=0,cnt=raw_pointer.length ;i<cnt ; i++ )
        pointer_list.push(  { 'x':raw_pointer[i]['x'] , 'y':raw_pointer[i]['y'] , 'group_index': -1 , 'distance':-1 } );

var group_cnt = 100;

//
// center = []
// center[0] = { 'group_list':[] , 'x':x , 'y':y }
//
var center = new Array();
for( var i=0 ; i<group_cnt ; ++i )
        center.push( { 'group_list':[] } );

for( var i=0, cnt=pointer_list.length ; i<cnt ; ++i )
{
        var c_index = parseInt( i / ( cnt / group_cnt ) );
        if( center[ c_index ]['x'] == undefined )
        {
                center[ c_index ]['x'] = pointer_list[i]['x'];
                center[ c_index ]['y'] = pointer_list[i]['y'];
                //center[ c_index ]['group_list'].push( i );
                pointer_list[i]['group_index'] = c_index;
                pointer_list[i]['distance'] = 0;
        }
}
console.log( 'group:'+group_cnt+',pointer:'+pointer_list.length );
var wanna_finish = 0;
var run_cnt = 0;
var max_run_cnt = 30;
while( !wanna_finish && run_cnt < max_run_cnt )
{
        wanna_finish = 1;
        for( var i=0, cnt=pointer_list.length ; i<cnt ; ++i )
        {
                for( var j=0; j<group_cnt ; ++j )
                {
                        var diff_x = Math.abs( center[j]['x'] - pointer_list[i]['x'] );
                        var diff_y = Math.abs( center[j]['y'] - pointer_list[i]['y'] );
                        var diff = Math.sqrt( diff_x*diff_x + diff_y*diff_y );
                        if( pointer_list[i]['group_index'] < 0 || pointer_list[i]['distance'] > diff )
                        {
                                pointer_list[i]['group_index'] = j;
                                pointer_list[i]['distance'] = diff;
                        }
                }
                center[ pointer_list[i]['group_index'] ]['group_list'].push( i );
        }
        for( var i=0 ;i<group_cnt ; ++i )
        {

                var cnt = center[i]['group_list'].length;
                var x = 0;
                var y = 0;
                for( var k=0 ; k<cnt ; k++ )
                {
                        x += pointer_list[ center[i]['group_list'][k] ]['x'];
                        y += pointer_list[ center[i]['group_list'][k] ]['y'];
                }
                center[i]['x'] = x/cnt;
                center[i]['y'] = y/cnt;

                if(
                        center[i]['old_x'] == undefined || center[i]['old_y'] == undefined
                        || center[i]['old_x'] != center[i]['x'] || center[i]['old_y'] != center[i]['y']
                 )
                {
                        wanna_finish = 0;
                        center[i]['old_x'] = center[i]['x'];
                        center[i]['old_y'] = center[i]['y'];
                }
                console.log( run_cnt+' @ Group:'+cnt+',center['+i+']:('+center[i]['x']+','+center[i]['y']+')');
        }
        if( !wanna_finish )
        {
                if( run_cnt < max_run_cnt )
                {
                        for( var i=0 ; i<group_cnt ; ++i )
                                center[ i ]['group_list'] = [];
                }
                for( var i=0, cnt=pointer_list.length ; i<cnt ; ++i )
                {
                        pointer_list[i]['group_index'] = -1;
                }
        }
        run_cnt ++;
        console.log( 'WannaFinish:' + wanna_finish + ' @ Run:' + run_cnt );
}


其他資訊:



  • group_cnt:代表最後的 k 是幾個,此例為 100 個

  • max_run_cnt:一種額外的終止條件,避免 k-means 找太久,此例為 30 次

  • center:array list,記錄找出來的族群資訊,分別有 x, y 座標,以及 group_list 是 array list,記錄 pointer 在 pointer_list 的 index 位置

  • pointer_list:array list,記錄 x,y 座標,以及所屬的 group (記錄 index) 和與該 group 中心點的距離


沒有留言:

張貼留言