PageRank的相关说明
- 整个互联网,看做一张有向图
- 网页是图上的点
- 链接是图上的又向边
- 每个网页有一个权威评分,称作pagerank,可以把它当做一种投票权
- 将每个超链接作为一次投票
- 每个网页的pagerank等于所有指向该网页超链接的网页的pagerank的加权和
- 这些权值等于这些网页各自向外链接数目的倒数
里面主要的点:
- 足够优美,表达简单
- 关于加权和部分,计算方式聪明
class Vertex:
def __init__(self):
self.in_degree = 0
self.out_degree = 0
self.pagerank = 0.0
class Edge:
def __init__(self, start, end):
self.start_id = start
self.end_id = end
def addVertex(vertex_name, vtx_map):
'''
return the id of vertex_name
if exists in vtx_map, return directly
if not exists, add it, and then return
vertex_name: string, the url of a web page
vtx_map: dict, the map from url to id
'''
res_id = 0
if vertex_name in vtx_map:
return vtx_map[vertex_name]
else:
res_id = len(vtx_map)
vtx_map[vertex_name] = res_id
return res_id
def readTable(fname, vtx_map, edge_list):
'''
read fname line by line, update the vtx_map and edge_list
fname: string, the file name of the table
vtx_map: dict, the map from url to id
edge_list: list, the list of all edges
'''
with open(fname, 'r') as fin:
for line in fin.readlines():
tmp = line.strip().split('\t')
assert(len(tmp) == 2)
start = addVertex(tmp[0], vtx_map)
end = addVertex(tmp[1], vtx_map)
edge_list.append(Edge(start, end))
return None
def initialize(vtx_map, edge_list, vtx_list):
'''
initialize the data structures
vtx_map: dict, the map from url to id
edge_list: list, the list of all edges
vtx_list: list, the list of all vertices
'''
vtx_num = len(vtx_map)
assert(vtx_num > 0)
vtx_list = [Vertex() for _ in range(vtx_num)]
for i in range(vtx_num):
vtx_list[i].pagerank = 1.0 / vtx_num
for edge in edge_list:
vtx_list[edge.start_id].out_degree += 1
vtx_list[edge.end_id].in_degree += 1
return None
def calcPagerank(alpha, num_iter, vtx_map, edge_list):
'''
calc PageRank for all vertices
return: vtx_list, list, the list of all vertices
alpha: float, damping factor
num_iter: int, the upper limitation of calculation
vtx_map: dict, the map from url to id
edge_list: list, the list of all edges
'''
vtx_list, pr_list = list(), list()
initialize(vtx_map, edge_list, vtx_list)
vtx_num = len(vtx_list)
assert(vtx_num > 0)
alpha = float(alpha)
for _ in range(num_iter):
pr_list = [alpha / vtx_num for _ in range(vtx_num)]
# calc
for edge in edge_list:
pr_list += (1 - alpha) * vtx_list[edge.start_id].pagerank / \
vtx_list[edge.start_id].out_degree
# revise
revise = sum(map(lambda vtx: (1 - alpha) * vtx.pagerank / vtx_num, \
filter(lambda vtx:(vtx.out_degree == 0), vtx_list)))
# update
for i in range(vtx_num):
vtx_list[i].pagerank = pr_list[i] + revise
return vtx_list
def doCalcPageRank(fname = 'wt2g_inlinks.source', alpha = 0.15, num_iter = 30):
vtx_map = dict()
edge_list = list()
readTable(fname, vtx_map, edge_list)
return calcPagerank(alpha, num_iter, vtx_map, edge_list)
if __name__ == '__main__':
print doCalcPageRank()
相关程序说明:
-