Skip to content
gqlxj1987's Blog
Go back

Page Rank

Edit page

PageRank的相关说明

里面主要的点:


class Vertex:
    def __init__(self):
        self.in_degree = 0
        self.out_degree = 0
        self.pagerank = 0.0

class Edge:
    def __init__(self, start, end):
        self.start_id = start
        self.end_id = end

def addVertex(vertex_name, vtx_map):
    '''
    return the id of vertex_name
    if exists in vtx_map, return directly
    if not exists, add it, and then return

    vertex_name: string, the url of a web page
    vtx_map: dict, the map from url to id
    '''
    res_id = 0
    if vertex_name in vtx_map:
        return vtx_map[vertex_name]
    else:
        res_id = len(vtx_map)
        vtx_map[vertex_name] = res_id
    return res_id

def readTable(fname, vtx_map, edge_list):
    '''
    read fname line by line, update the vtx_map and edge_list

    fname: string, the file name of the table
    vtx_map: dict, the map from url to id
    edge_list: list, the list of all edges
    '''
    with open(fname, 'r') as fin:
        for line in fin.readlines():
            tmp = line.strip().split('\t')
            assert(len(tmp) == 2)
            start = addVertex(tmp[0], vtx_map)
            end = addVertex(tmp[1], vtx_map)
            edge_list.append(Edge(start, end))
    return None

def initialize(vtx_map, edge_list, vtx_list):
    '''
    initialize the data structures

    vtx_map: dict, the map from url to id
    edge_list: list, the list of all edges
    vtx_list: list, the list of all vertices
    '''
    vtx_num = len(vtx_map)
    assert(vtx_num > 0)
    vtx_list = [Vertex() for _ in range(vtx_num)]
    for i in range(vtx_num):
        vtx_list[i].pagerank = 1.0 / vtx_num
    for edge in edge_list:
        vtx_list[edge.start_id].out_degree += 1
        vtx_list[edge.end_id].in_degree += 1
    return None

def calcPagerank(alpha, num_iter, vtx_map, edge_list):
    '''
    calc PageRank for all vertices
    return: vtx_list, list, the list of all vertices

    alpha: float, damping factor
    num_iter: int, the upper limitation of calculation
    vtx_map: dict, the map from url to id
    edge_list: list, the list of all edges
    '''
    vtx_list, pr_list = list(), list()
    initialize(vtx_map, edge_list, vtx_list)
    vtx_num = len(vtx_list)
    assert(vtx_num > 0)
    alpha = float(alpha)
    for _ in range(num_iter):
        pr_list = [alpha / vtx_num for _ in range(vtx_num)]
        # calc
        for edge in edge_list:
            pr_list += (1 - alpha) * vtx_list[edge.start_id].pagerank / \
                        vtx_list[edge.start_id].out_degree
        # revise
        revise = sum(map(lambda vtx: (1 - alpha) * vtx.pagerank / vtx_num, \
                        filter(lambda vtx:(vtx.out_degree == 0), vtx_list)))
        # update
        for i in range(vtx_num):
            vtx_list[i].pagerank = pr_list[i] + revise
    return vtx_list

def doCalcPageRank(fname = 'wt2g_inlinks.source', alpha = 0.15, num_iter = 30):
    vtx_map = dict()
    edge_list = list()
    readTable(fname, vtx_map, edge_list)
    return calcPagerank(alpha, num_iter, vtx_map, edge_list)

if __name__ == '__main__':
    print doCalcPageRank()

相关程序说明:


Edit page
Share this post on:

Previous Post
Go performance
Next Post
莫道不肝肠