DevilKing's blog

冷灯看剑,剑上几分功名?炉香无需计苍生,纵一穿烟逝,万丈云埋,孤阳还照古陵

0%

Page Rank

PageRank的相关说明

  • 整个互联网,看做一张有向图
    • 网页是图上的点
    • 链接是图上的又向边
  • 每个网页有一个权威评分,称作pagerank,可以把它当做一种投票权
  • 将每个超链接作为一次投票
    • 每个网页的pagerank等于所有指向该网页超链接的网页的pagerank的加权和
    • 这些权值等于这些网页各自向外链接数目的倒数

里面主要的点:

  • 足够优美,表达简单
  • 关于加权和部分,计算方式聪明
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

class Vertex:
def __init__(self):
self.in_degree = 0
self.out_degree = 0
self.pagerank = 0.0

class Edge:
def __init__(self, start, end):
self.start_id = start
self.end_id = end

def addVertex(vertex_name, vtx_map):
'''
return the id of vertex_name
if exists in vtx_map, return directly
if not exists, add it, and then return

vertex_name: string, the url of a web page
vtx_map: dict, the map from url to id
'''
res_id = 0
if vertex_name in vtx_map:
return vtx_map[vertex_name]
else:
res_id = len(vtx_map)
vtx_map[vertex_name] = res_id
return res_id

def readTable(fname, vtx_map, edge_list):
'''
read fname line by line, update the vtx_map and edge_list

fname: string, the file name of the table
vtx_map: dict, the map from url to id
edge_list: list, the list of all edges
'''
with open(fname, 'r') as fin:
for line in fin.readlines():
tmp = line.strip().split('\t')
assert(len(tmp) == 2)
start = addVertex(tmp[0], vtx_map)
end = addVertex(tmp[1], vtx_map)
edge_list.append(Edge(start, end))
return None

def initialize(vtx_map, edge_list, vtx_list):
'''
initialize the data structures

vtx_map: dict, the map from url to id
edge_list: list, the list of all edges
vtx_list: list, the list of all vertices
'''
vtx_num = len(vtx_map)
assert(vtx_num > 0)
vtx_list = [Vertex() for _ in range(vtx_num)]
for i in range(vtx_num):
vtx_list[i].pagerank = 1.0 / vtx_num
for edge in edge_list:
vtx_list[edge.start_id].out_degree += 1
vtx_list[edge.end_id].in_degree += 1
return None

def calcPagerank(alpha, num_iter, vtx_map, edge_list):
'''
calc PageRank for all vertices
return: vtx_list, list, the list of all vertices

alpha: float, damping factor
num_iter: int, the upper limitation of calculation
vtx_map: dict, the map from url to id
edge_list: list, the list of all edges
'''
vtx_list, pr_list = list(), list()
initialize(vtx_map, edge_list, vtx_list)
vtx_num = len(vtx_list)
assert(vtx_num > 0)
alpha = float(alpha)
for _ in range(num_iter):
pr_list = [alpha / vtx_num for _ in range(vtx_num)]
# calc
for edge in edge_list:
pr_list += (1 - alpha) * vtx_list[edge.start_id].pagerank / \
vtx_list[edge.start_id].out_degree
# revise
revise = sum(map(lambda vtx: (1 - alpha) * vtx.pagerank / vtx_num, \
filter(lambda vtx:(vtx.out_degree == 0), vtx_list)))
# update
for i in range(vtx_num):
vtx_list[i].pagerank = pr_list[i] + revise
return vtx_list

def doCalcPageRank(fname = 'wt2g_inlinks.source', alpha = 0.15, num_iter = 30):
vtx_map = dict()
edge_list = list()
readTable(fname, vtx_map, edge_list)
return calcPagerank(alpha, num_iter, vtx_map, edge_list)

if __name__ == '__main__':
print doCalcPageRank()


相关程序说明: