import networkx as nx
import numpy as np
import scipy as sp
import scipy.cluster.vq as vq
import matplotlib.pyplot as plt
import math
import random
import operator
import itertools

################################################################################
# Read in the facebook datasets from HW2

G = nx.read_edgelist("facebook_combined.data", comments="%")

################################################################################
# Derive the parameters of an Erdos-Renyi network with the same order as G and
# the same expected size: p is the fraction of all possible vertex pairs that
# are actually edges in G.

n = G.number_of_nodes()
m = G.number_of_edges()
p = 2*m / (n*(n-1))
H = nx.erdos_renyi_graph(n, p)

# Sanity check: print order then size for each network; they should be similar.
for graph in (G, H):
  print(graph.order())
  print(graph.size())

################################################################################
# Let's look at triangle counts in each network first. To avoid multiple counts
# of the same triangle, we'll ensure a sorted order to our count.

def count_triangles(G):
  """Return the number of triangles in G.

  G must provide nodes(), neighbors(v) and has_edge(a, b), and its vertex
  labels must be mutually comparable with '<'.

  Each triangle {a, b, c} is counted exactly once, at the traversal
  a -> b -> c whose labels are strictly increasing, provided the closing edge
  (a, c) exists.
  """
  return sum(
    1
    for a in G.nodes()
    for b in G.neighbors(a)
    if a < b
    for c in G.neighbors(b)
    if b < c and G.has_edge(a, c)
  )

# Triangle counts: the real network first, then the random one.
for graph in (G, H):
  print(count_triangles(graph))

################################################################################
# We can consider a larger cycle of size 4 as well. We'll need to start doing
# some basic optimization as we're increasing complexity. Note here that we're
# effectively counting non-induced subgraphs.

def count_4cycle(G):
  """Return the number of 4-cycles in G, counted as non-induced subgraphs.

  G must provide nodes(), neighbors(v) and has_edge(a, b), and its vertex
  labels must be mutually comparable with '<'. Chords are allowed, so a
  4-clique contributes three distinct 4-cycles.

  A 4-cycle v-u-w-x-v can be traversed from 4 starting vertices in 2
  directions. To count it exactly once we only accept the traversal where
    * v is the smallest vertex on the cycle (fixes the starting point), and
    * u < x, the two cycle-neighbors of v (fixes the direction).
  Requiring v < u < w < x instead would miss every cycle whose cyclic order is
  not monotone in the labels (e.g. 0-1-3-2-0).
  """
  count = 0
  for v in G.nodes():
    for u in G.neighbors(v):
      if not v < u:
        continue
      for w in G.neighbors(u):
        # w must be a new vertex larger than the anchor v; 'w == u' only
        # happens with a self-loop and would yield a degenerate cycle.
        if w == u or not v < w:
          continue
        for x in G.neighbors(w):
          # u < x also guarantees x differs from u and from v.
          if x != w and u < x and G.has_edge(v, x):
            count += 1
  return count

# Non-induced 4-cycle counts: the real network first, then the random one.
for graph in (G, H):
  print(count_4cycle(graph))

################################################################################
# Let's look at how we can modify the above to specifically consider induced
# subgraphs, and how it might affect our counts. Note that our cycle should
# not have any 'chords', which are edges between vertices on the cycle that
# are not among the edges comprising the cycle.

def count_4cycle_induced(G):
  """Return the number of induced (chordless) 4-cycles in G.

  G must provide nodes(), neighbors(v) and has_edge(a, b), and its vertex
  labels must be mutually comparable with '<'.

  A cycle v-u-w-x-v is induced when neither diagonal (v, w) nor (u, x) is an
  edge. Each cycle is counted once by accepting only the traversal where v is
  the smallest vertex on the cycle and u < x (u and x being the two
  cycle-neighbors of v). Requiring v < u < w < x instead would miss every
  cycle whose cyclic order is not monotone in the labels (e.g. 0-1-3-2-0).
  """
  count = 0
  for v in G.nodes():
    for u in G.neighbors(v):
      if not v < u:
        continue
      for w in G.neighbors(u):
        # Skip degenerate (self-loop) steps, vertices below the anchor v, and
        # any w that would create the chord (v, w).
        if w == u or not v < w or G.has_edge(v, w):
          continue
        for x in G.neighbors(w):
          # u < x fixes the direction; the last test rejects the chord (u, x).
          if (x != w and u < x and G.has_edge(v, x)
              and not G.has_edge(x, u)):
            count += 1
  return count

# Notice the difference in counts in between our two networks when we consider
# induced vs. non-induced subgraphs. How can we explain that observation?
# Induced 4-cycle counts: the real network first, then the random one.
for graph in (G, H):
  print(count_4cycle_induced(graph))


################################################################################
# Finally, let's consider counting stars. A k-star is an induced subgraph that
# is comprised of a central vertex and k edges connecting that vertex to k
# distinct neighbors. Our approach will be similar as above.

def count_k_stars(G, k):
  """Return the number of induced k-stars in the undirected graph G.

  An induced k-star is a center vertex together with k distinct neighbors
  (the leaves) such that no two leaves are adjacent to each other.

  G must provide nodes(), neighbors(v), degree(v) and has_edge(a, b).
  k is the number of leaves per star; it is honored for any k >= 0.

  For every vertex the candidate leaf sets are enumerated directly as
  k-combinations of its neighborhood, so each unordered leaf set is visited
  exactly once and vertex labels need not be numeric or even orderable.
  """
  count = 0
  for v in G.nodes():
    # Progress trace: the work per vertex grows like C(degree, k), so
    # high-degree vertices dominate the running time.
    print(v, G.degree(v))
    candidates = list(G.neighbors(v))
    for leaves in itertools.combinations(candidates, k):
      # The star is induced only if no pair of leaves is joined by an edge.
      if not any(G.has_edge(a, b)
                 for a, b in itertools.combinations(leaves, 2)):
        count += 1
  return count

# Induced 3-star counts: the real network first, then the random one.
for graph in (G, H):
  print(count_k_stars(graph, 3))

# Note how long the above takes. If we instead wanted to count non-induced star
# embeddings, would we be able to do it in a shorter or longer amount of time?
# We could also try an approximate approach - what considerations would we need
# to include? Note that subgraph embeddings as rooted at a given vertex can be
# highly sensitive to degree, so simply sampling X% of vertices in G can result
# in large variance in error.