In layman's terms, you can think of Connected Components as a kind of hard clustering algorithm that finds clusters (islands) in related or connected data. As a concrete example: say you have data about the roads joining any two cities in the world, and you need to find all the continents in the world and which cities each one contains.
from itertools import combinations

from graphframes import *


def parse_vertices(line):
    """Return the integer vertex ids listed on one adjacency-list line.

    Args:
        line: Whitespace-separated integer ids, e.g. ``"1 2 3"``.

    Returns:
        A list of ints in the order they appear; empty for a blank line.
    """
    # split() with no argument tolerates repeated/trailing whitespace and
    # blank lines, where split(" ") would yield '' and make int() raise.
    return [int(token) for token in line.split()]


def create_edges(line):
    """Return the directed edges implied by one adjacency-list line.

    Every pair of ids on the line is linked in both directions. A line
    holding a single id yields one self-loop so that the id still shows
    up in the edge list.

    Args:
        line: Whitespace-separated integer ids, e.g. ``"1 2 3"``.

    Returns:
        A list of ``(src, dst)`` tuples; empty for a blank line.
    """
    ids = [int(token) for token in line.split()]
    if len(ids) == 1:
        return [(ids[0], ids[0])]

    edges_list = []
    for src, dst in combinations(ids, 2):
        # Emit both directions for each unordered pair.
        edges_list.append((src, dst))
        edges_list.append((dst, src))
    return edges_list


# Build the DataFrames straight from the RDDs instead of collecting the
# data to the driver and shipping it back out again.
vertex_ids = adjacency_list.flatMap(parse_vertices).distinct()
vertices = sqlContext.createDataFrame(vertex_ids.map(lambda v: (v,)), ["id"])

edge_pairs = adjacency_list.flatMap(create_edges).distinct()
edges = sqlContext.createDataFrame(edge_pairs, ["src", "dst"])

g = GraphFrame(vertices, edges)

# A checkpoint directory is set before running connectedComponents();
# NOTE(review): "." writes checkpoints into the current working directory.
sc.setCheckpointDir(".")

# graphframes uses the same paper we referenced apparently
cc = g.connectedComponents()

# show() prints the table itself and returns None, so it is not wrapped
# in print (which would also emit a stray "None").
cc.show()