Connected Components
You can think of Connected Components in very layman’s terms as sort of a hard clustering algorithm which finds clusters/islands in related/connected data. As a concrete example: Say you have data about roads joining any two cities in the world. And you need to find out all the continents in the world and which city they contain.
pyspark use graphFrame
from graphframes import *
def vertices(line):
vert = [int(x) for x in line.split(" ")]
return vert
vertices = adjacency_list.flatMap(lambda x: vertices(x)).distinct().collect()
vertices = sqlContext.createDataFrame([[x] for x in vertices], ["id"])
def create_edges(line):
a = [int(x) for x in line.split(" ")]
edges_list=[]
if len(a)==1:
edges_list.append((a[0],a[0]))
for i in range(0, len(a)-1):
for j in range(i+1 ,len(a)):
edges_list.append((a[i],a[j]))
edges_list.append((a[j],a[i]))
return edges_list
edges = adjacency_list.flatMap(lambda x: create_edges(x)).distinct().collect()
edges = sqlContext.createDataFrame(edges, ["src", "dst"])
g = GraphFrame(vertices, edges)
sc.setCheckpointDir(".")
# graphframes uses the same paper we referenced apparently
cc = g.connectedComponents()
print cc.show()