1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
| class Vertex: def __init__(self): self.in_degree = 0 self.out_degree = 0 self.pagerank = 0.0
class Edge: def __init__(self, start, end): self.start_id = start self.end_id = end
def addVertex(vertex_name, vtx_map): ''' return the id of vertex_name if exists in vtx_map, return directly if not exists, add it, and then return
vertex_name: string, the url of a web page vtx_map: dict, the map from url to id ''' res_id = 0 if vertex_name in vtx_map: return vtx_map[vertex_name] else: res_id = len(vtx_map) vtx_map[vertex_name] = res_id return res_id
def readTable(fname, vtx_map, edge_list): ''' read fname line by line, update the vtx_map and edge_list
fname: string, the file name of the table vtx_map: dict, the map from url to id edge_list: list, the list of all edges ''' with open(fname, 'r') as fin: for line in fin.readlines(): tmp = line.strip().split('\t') assert(len(tmp) == 2) start = addVertex(tmp[0], vtx_map) end = addVertex(tmp[1], vtx_map) edge_list.append(Edge(start, end)) return None
def initialize(vtx_map, edge_list, vtx_list): ''' initialize the data structures
vtx_map: dict, the map from url to id edge_list: list, the list of all edges vtx_list: list, the list of all vertices ''' vtx_num = len(vtx_map) assert(vtx_num > 0) vtx_list = [Vertex() for _ in range(vtx_num)] for i in range(vtx_num): vtx_list[i].pagerank = 1.0 / vtx_num for edge in edge_list: vtx_list[edge.start_id].out_degree += 1 vtx_list[edge.end_id].in_degree += 1 return None
def calcPagerank(alpha, num_iter, vtx_map, edge_list): ''' calc PageRank for all vertices return: vtx_list, list, the list of all vertices
alpha: float, damping factor num_iter: int, the upper limitation of calculation vtx_map: dict, the map from url to id edge_list: list, the list of all edges ''' vtx_list, pr_list = list(), list() initialize(vtx_map, edge_list, vtx_list) vtx_num = len(vtx_list) assert(vtx_num > 0) alpha = float(alpha) for _ in range(num_iter): pr_list = [alpha / vtx_num for _ in range(vtx_num)] for edge in edge_list: pr_list += (1 - alpha) * vtx_list[edge.start_id].pagerank / \ vtx_list[edge.start_id].out_degree revise = sum(map(lambda vtx: (1 - alpha) * vtx.pagerank / vtx_num, \ filter(lambda vtx:(vtx.out_degree == 0), vtx_list))) for i in range(vtx_num): vtx_list[i].pagerank = pr_list[i] + revise return vtx_list
def doCalcPageRank(fname = 'wt2g_inlinks.source', alpha = 0.15, num_iter = 30): vtx_map = dict() edge_list = list() readTable(fname, vtx_map, edge_list) return calcPagerank(alpha, num_iter, vtx_map, edge_list)
if __name__ == '__main__': print doCalcPageRank()
|