########################
# Supp. File 1. Characterization of Nested Complexes in PINs --R code
# Programming language: R v.2.15.2
# Programmer: Antonio Mora, antoniocmora@gmail.com
# Jan.-Feb., 2013
########################

#######################
# 0. Needed Libraries and Files:
#######################
setwd("**Introduce your working directory here**")
library(iRefR)							# iRefR can be installed from CRAN.

irefindex_curr_human = get_irefindex("9606", "current", getwd())	# 401140 records

# Assumption1: Only human-human interactions:
human_human_list = data.frame(irefindex_curr_human$taxa, irefindex_curr_human$taxb)
tmp = do.call(`paste`, c(unname(human_human_list), list(sep=".")))
irefindex_curr_hh = irefindex_curr_human[tmp=="taxid:9606(Homo sapiens).taxid:9606(Homo sapiens)" | tmp=="-.taxid:9606(Homo sapiens)",]							# 361059 records

# Assumption2: Removing intra-protein interactions:
irefindex_curr_hh = irefindex_curr_hh[-which(irefindex_curr_hh$edgetype=="X" & (as.character(irefindex_curr_hh[,1])==as.character(irefindex_curr_hh[,2]))==TRUE),]				# 347753 records

# Full PIN: Canonical, all confidence, multigraph:
edgeList_full = convert_MITAB_to_edgeList(irefindex_curr_hh, "default", "spoke", node_names="uids", multi_edge="yes")
PIN_full = convert_edgeList_to_graph(edgeList_full)
PIN_full								# 19482 nodes, 162254 edges

# Assumption3: Remove predicted data (exclude OPHID):
irefindex_curr_hh_exp = select_database("ophid", irefindex_curr_hh, "except_this_database")	# 289451 records

# Subset experimental: Canonical, all confidence, multigraph:
edgeList_exp = convert_MITAB_to_edgeList(irefindex_curr_hh_exp, "default", "spoke", node_names="uids", multi_edge="yes")
PIN_exp = convert_edgeList_to_graph(edgeList_exp)
PIN_exp									# 18383 nodes, 133783 edges

#######################
# 1. Generating complex data structures:
#######################
# Needed functions (additional to iRefR):
generate_known_complexes_uids = function(iRef_C) {
	# Generating complexList from known complexes:
	complex_name = list(); complexList = list()
	irigids = unique(iRef_C$irigid)
	for (i in irigids) {
		this_complex = sort(unique(iRef_C[which(iRef_C$irigid==i), "uidB"]))
		if (length(this_complex) > 1) {
			complex_name[[i]] = i
			complexList[[i]] = paste(this_complex, collapse=",", sep="")
		}
	}
	complexName_table = do.call(rbind, complex_name)
	complexList_table = do.call(rbind, complexList)
	output = cbind(complexName_table, complexList_table)
}

# Generating complex data structures:
iRef_C = select_interaction_type("complex", irefindex_curr_hh)
dim(iRef_C)							# 58854 x 54
complexList_C = generate_known_complexes_uids(iRef_C)
dim(complexList_C)						# 5816x2, 10'

edgeList_C = convert_MITAB_to_edgeList(iRef_C, canonical_rep="no", node_names="uids", multi_edge="yes")	# 2'
dim(edgeList_C)							# 37075x5
edges_one_subunit = NULL
for (i in unique(edgeList_C[,3])) {
	tmp = which(edgeList_C[,3]==i)
	if (length(tmp)==1) {
		edges_one_subunit = c(edges_one_subunit, tmp)
	}
}
edgeList_C = edgeList_C[-edges_one_subunit,]
dim(edgeList_C)							# 36703x5
# Note: Complexes with one only protein were removed.

PIN_C = convert_edgeList_to_graph(edgeList_C)
PIN_C								# 7397 nodes, 36703 edges

iRef_B = select_interaction_type("binary", irefindex_curr_hh)
dim(iRef_B)							# 287959 x 54

# Removing repeated complexes:
remove_repeated_complexes = function(complexes_table_tmp) {
	complexes_table = NULL
	each_complex = unique(complexes_table_tmp[,2])
	for (i in 1:length(each_complex)) {
		tmp = complexes_table_tmp[which(complexes_table_tmp[,2] == each_complex[i]),]
		if (is.vector(tmp)==T) {
			complexes_table = rbind(complexes_table, tmp)
		} else {
			tmp2 = cbind(paste(tmp[,1], collapse="|", sep=""), tmp[1,2])
			complexes_table = rbind(complexes_table, tmp2)
		}
	}
	rownames(complexes_table) = NULL
	complexes_table
}

complexList_C_unique = remove_repeated_complexes(complexList_C)
dim(complexList_C_unique)								# 5681 complexes

#######################
# 2. Quantifying overlap and nesting in iRefIndex complexes:
#######################
# Jaccard Matrix (Overlap quantification) and Meet-Min Matrix (Nesting quantification):

generate_overlap_nesting_matrices = function(complexList) {
	overlap = matrix(0,nrow=length(complexList), ncol=length(complexList)); jaccard_index = matrix(0,nrow=length(complexList), ncol=length(complexList)); meetmin_index = matrix(0,nrow=length(complexList), ncol=length(complexList))

	for (i in 1:(length(complexList)-1)) {
		complex_i = strsplit(complexList[i], ",", fixed=TRUE)[[1]]
		for (j in (i+1):length(complexList)) {
			complex_j = strsplit(complexList[j], ",", fixed=TRUE)[[1]]
			overlap[i,j] = length(intersect(complex_i, complex_j))
			jaccard_index[i,j] = overlap[i,j] / length(unique(complex_i, complex_j))
			meetmin_index[i,j] = overlap[i,j] / min(length(complex_i), length(complex_j))
		}
	}
	result = list(overlap=overlap, jaccard_index=jaccard_index, meetmin_index=meetmin_index)
}

tmp = generate_overlap_nesting_matrices(complexList_C_unique[,2])		# TIME: 1hr 10'
overlap_complex_complex = tmp$overlap
jaccard_complex_complex = tmp$jaccard_index
meetmin_complex_complex = tmp$meetmin_index

# Quantify all significant in half the matrix:					# TIME: 9'
count_over = 0; count_meetmin = 0; all_pairs = 0; positive_overlap = NULL; positive_meetmin = NULL
for (i in 1:(length(complexList_C_unique[,2])-1)) {
	for (j in (i+1):length(complexList_C_unique[,2])) {
		all_pairs = all_pairs + 1
		if (overlap_complex_complex[i,j] > 0) {
			positive_overlap = c(positive_overlap, overlap_complex_complex[i,j])
			count_over = count_over + 1
			positive_meetmin = c(positive_meetmin, meetmin_complex_complex[i,j])
			if (meetmin_complex_complex[i,j] == 1) {
				count_meetmin = count_meetmin + 1
			}
		}
	}
}
all_pairs						# 16134040
count_over						# 213626
count_meetmin						# 6275

# Histograms:
postscript("fig3.eps", horizontal=F)
	par(mfrow=c(2,1))
	hist(positive_overlap, breaks=100, main="(a)")
	hist(positive_meetmin, main="(b)")
dev.off()

#######################
# 3. Graphical representation of complex overlap and nesting:
#######################
# Detect some interesting nesting complexes from the edgeList:
edgeList_tmp = convert_MITAB_to_edgeList(iRef_C, multi_edge="no") #, node_names="uids"
chosen_int = 1
rigids_multi_edge = edgeList_tmp[grep("\\|", edgeList_tmp[,3])[[chosen_int]], 3]
rigids_multi_edge = strsplit(rigids_multi_edge, "|", fixed="TRUE")[[1]]
rigids_multi_edge

index_interesting_multi_edge = unlist(lapply(rigids_multi_edge, grep, edgeList_C[,3]))
edgeList_interesting_complex = edgeList_C[index_interesting_multi_edge,]
edgeList_interesting_complex

# Option 0: Simple merge: Information on nesting is lost:
graph_interesting = convert_edgeList_to_graph(edgeList_interesting_complex)
jpeg("nested_example0.jpg", width = 1024, height = 512)
	plot(simplify(graph_interesting, edge.attr.comb="ignore"), vertex.label=V(graph_interesting)$name, vertex.size=13, main="Hidden Nested complexes")
dev.off()

# Plotting option 1: Multi-edge plot:
plot_nest_example = function(edgeList_interesting_complex) {
	graph_interesting = convert_edgeList_to_graph(edgeList_interesting_complex)
	list_edge_colors = NULL
	list_edge_colors[which(edgeList_interesting_complex[,3]== "1024442")] = "red"
	list_edge_colors[which(edgeList_interesting_complex[,3]== "617191")] = "blue"
	list_edge_colors[which(edgeList_interesting_complex[,3]== "707306")] = "green"
	graph_interesting <- set.edge.attribute(graph_interesting, "color", value=list_edge_colors)
}

graph_interesting = plot_nest_example(edgeList_interesting_complex)
jpeg("nested_example1.jpg", width = 1024, height = 512)
	plot(graph_interesting, layout=layout.kamada.kawai, vertex.label=V(graph_interesting)$name, vertex.size=13, main="Nested complexes -Multi-edge view")
	legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green"))
dev.off()

# Plotting option 2: Multi-node plot:
multiplicate_nodes = function(edgeList_interesting_complex, list_nodes, list_rigids) {
	for (i in 1:length(list_nodes)) {
		n = 0
		for (j in 1:length(list_rigids)) {
			tmp = which(edgeList_interesting_complex[,3] == list_rigids[j])
			if(edgeList_interesting_complex[tmp[1],1]==list_nodes[i]) {
				n = n + 1
				edgeList_interesting_complex[tmp,1] = paste(edgeList_interesting_complex[tmp[1],1], n, sep="_")
			}
			if(edgeList_interesting_complex[tmp[1],2]==list_nodes[i]) {
				n = n + 1
				edgeList_interesting_complex[tmp,2] = paste(edgeList_interesting_complex[tmp[1],2], n, sep="_")
			}
		}
	}
	edgeList_interesting_complex
}

# Multiplicating only hub node:
edgeList_interesting_case2 = multiplicate_nodes(edgeList_interesting_complex, "uniprotkb:Q9Y468", c("1024442", "617191", "707306"))

graph_interesting2 = plot_nest_example(edgeList_interesting_case2)
jpeg("nested_example2.jpg", width = 1024, height = 512)
	plot(layout.sugiyama(graph_interesting2, attributes="all")$extd_graph, vertex.label=V(graph_interesting2)$name, vertex.size=13, main="Nested complexes -Multi-hub view")
	legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green"))
dev.off()

# Multiplicating all nodes in the graph:
multiplicate_all_nodes = function(edgeList_interesting_complex) {
	list_rigids = unique(edgeList_interesting_complex[,3])
	for (i in 1:length(list_rigids)) {
		tmp = which(edgeList_interesting_complex[,3] == list_rigids[i])
		edgeList_interesting_complex[tmp,1] = paste(edgeList_interesting_complex[tmp,1], i, sep="_")
		edgeList_interesting_complex[tmp,2] = paste(edgeList_interesting_complex[tmp[1],2], i, sep="_")
	}
	edgeList_interesting_complex
}

edgeList_interesting_case3 = multiplicate_all_nodes(edgeList_interesting_complex)
graph_interesting3 = plot_nest_example(edgeList_interesting_case3)
jpeg("nested_example3.jpg", width = 1024, height = 512)
	plot(graph_interesting3, vertex.label=V(graph_interesting3)$name, vertex.size=13, main="Nested complexes -Multi-node view")
	legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green"))
dev.off()

# Plotting option 3: Pie nodes:
jpeg("nested_example4.jpg", width = 1024, height = 512)
	plot(simplify(graph_interesting, edge.attr.comb="ignore"), layout=layout.kamada.kawai, vertex.shape="pie", vertex.pie=list(c(.5,0,.5), c(.33,.33,.33),c(0,.5,.5),c(.33,.33,.33)), vertex.pie.color=list(c("red", "blue", "green")), vertex.label=V(graph_interesting)$name, vertex.size=24, main="Nested complexes -Pie nodes view")
	legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green"))
dev.off()

postscript("fig4.eps", horizontal=F, fonts="serif")
	par(mfrow=c(3,2))
	plot(simplify(graph_interesting, edge.attr.comb="ignore"), vertex.label=gsub("uniprotkb:", "", V(graph_interesting)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(a)")
	plot(graph_interesting, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(graph_interesting)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(b)")
	#legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green"))
	plot(layout.sugiyama(graph_interesting2, attributes="all")$extd_graph, vertex.label=gsub("uniprotkb:", "", V(graph_interesting2)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(c)")
	#legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green"))
	plot(graph_interesting3, vertex.label=gsub("uniprotkb:", "", V(graph_interesting3)$name), vertex.size=32, vertex.color=NULL, vertex.label.cex=0.6, vertex.label.color="black", main="(d)")
	#legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green"))
	plot(simplify(graph_interesting, edge.attr.comb="ignore"), layout=layout.kamada.kawai, vertex.shape="pie", vertex.pie=list(c(.5,0,.5), c(.33,.33,.33),c(0,.5,.5),c(.33,.33,.33)), vertex.pie.color=list(c("red", "blue", "green")), vertex.label=gsub("uniprotkb:", "", V(graph_interesting)$name), vertex.size=50, vertex.label.cex=1.4, vertex.label.color="black", main="(e)")
	legend(.4, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green"))
dev.off()

#######################
# 4. Reconstruct list of nested cases
#######################
# Generate a table for each nested pair, using complexList format, and adding overlap and meetmin info:
# TIME: 2'
create_table_nested_complexList = function(complexList_C_unique, overlap_complex_complex, jaccard_complex_complex, meetmin_complex_complex) {
	table_nested_complexList = NULL
	for (i in 1:(dim(overlap_complex_complex)[1]-1)) {
		for (j in (i+1):dim(overlap_complex_complex)[1]) {
			if (meetmin_complex_complex[i,j] == 1) {
				longest_set = which.max(c(length(strsplit(complexList_C_unique[i,2], ",", fixed=T)[[1]]), length(strsplit(complexList_C_unique[j,2], ",", fixed=T)[[1]])))
				if (longest_set==1) {
					new_row = c(complexList_C_unique[i,1], complexList_C_unique[j,1], complexList_C_unique[i,2], complexList_C_unique[j,2], overlap_complex_complex[i,j], jaccard_complex_complex[i,j], meetmin_complex_complex[i,j])
				}
				if (longest_set==2) {
					new_row = c(complexList_C_unique[j,1], complexList_C_unique[i,1], complexList_C_unique[j,2], complexList_C_unique[i,2], overlap_complex_complex[i,j], jaccard_complex_complex[i,j], meetmin_complex_complex[i,j])
				}
				table_nested_complexList = rbind(table_nested_complexList, new_row)
			}
		}
	}

	rownames(table_nested_complexList) = NULL
	colnames(table_nested_complexList) = c("Macro complex -ID", "Nested complex -ID", "Macro complex -subunits", "Nested complex -subunits", "Overlap", "Jaccard", "Meet-Min") 
	dim(table_nested_complexList)					# 6275x7

	# Grooming the table:
	V2 = NULL
	for (i in 1:length(table_nested_complexList[,2])) {
		tmp = strsplit(table_nested_complexList[i,2], "|", fixed=T)[[1]]
		V2 = c(V2, sort(tmp)[1])
	}
	table_nested_complexList_final = as.data.frame(cbind(table_nested_complexList[,1], V2, table_nested_complexList[,3:4], table_nested_complexList[,6:7], weight=table_nested_complexList[,5]))
	# Note: Choosing one ID for complex and leaving overlap as weight.
}

table_nested_complexList_final = create_table_nested_complexList(complexList_C_unique, overlap_complex_complex, jaccard_complex_complex, meetmin_complex_complex)

# Levels of nesting --Summary of nested groups:
make_nested_groups = function(table_nested_complexList_final) {
	# DFS:
	graph_all_nesting = graph.data.frame(table_nested_complexList_final, directed="T")
	n=0; table_nested_groups = NULL
	for (i in unique(V(graph_all_nesting)$name)) {
		n=n+1
		nested_id = paste("NID", n, sep="_")
		tmp <- unique(graph.dfs(graph_all_nesting, root=which(V(graph_all_nesting)$name==i), "out", unreachable="F")$order)
		tmp2 = length(tmp)
		complex_group = V(graph_all_nesting)$name[tmp[-tmp2]]
		length_group = length(complex_group)
		nested_group = paste(complex_group, collapse=",", sep="")
		new_row = c(nested_id, nested_group, length_group)
		table_nested_groups = rbind(table_nested_groups, new_row)
	}

	# Remove subsets of others:
	complexList = table_nested_groups[,2]
	meetmin_index = matrix(0,nrow=length(complexList), ncol=length(complexList))
	complex_groups_to_remove = NULL
	for (i in 1:(length(complexList)-1)) {
		complex_i = strsplit(complexList[i], ",", fixed=TRUE)[[1]]
		for (j in (i+1):length(complexList)) {
			complex_j = strsplit(complexList[j], ",", fixed=TRUE)[[1]]
			overlap = length(intersect(complex_i, complex_j))
			meetmin_index = overlap / min(length(complex_i), length(complex_j))
			if (meetmin_index == 1) {
				if (length(complex_i) < length(complex_j)) {
					complex_groups_to_remove = c(complex_groups_to_remove, i)
				} else {
					complex_groups_to_remove = c(complex_groups_to_remove, j)
				}
			}
		}
	}

	table_nested_groups = table_nested_groups[-unique(complex_groups_to_remove),]
	new_ids = paste("NID", 1:dim(table_nested_groups)[1], sep="_")
	table_nested_groups[,1]= new_ids
	rownames(table_nested_groups) = NULL
	colnames(table_nested_groups) = c("nested_group", "nested_complexes", "group_size")

	# Generating graph objects for each group:
	graph_nested_comp = list()
	for (i in 1:dim(table_nested_groups)[1]) {
		tmp = strsplit(table_nested_groups[i,2], ",", fixed=T)[[1]]
		edgeList = table_nested_complexList_final[which((table_nested_complexList_final[,1] %in% tmp) & (table_nested_complexList_final[,2] %in% tmp)),]
		graph_nested_comp[[i]] = graph.data.frame(unique(edgeList), directed="T")
	}

	# Adding subunit info:
	a = c(as.character(table_nested_complexList_final[,1]), as.character(table_nested_complexList_final[,2]))
	b = c(as.character(table_nested_complexList_final[,3]), as.character(table_nested_complexList_final[,4]))
	c = unique(cbind(a, b))
	complex_to_prot = cbind(c[,1], gsub("uniprotkb:", "", c[,2]))			# 3073x2
	all_subunits = NULL; size_subunits = NULL
	for (i in 1:dim(table_nested_groups)[1]) {
		this_complex = strsplit(table_nested_groups[i,2], ",", fixed=T)[[1]]
		these_subunits = NULL; this_size = NULL
		for (j in this_complex) {
			tmp = complex_to_prot[which(complex_to_prot[,1]==j),2]
			these_subunits = c(these_subunits, tmp)
			this_size = c(this_size, length(strsplit(tmp, ",", fixed=T)[[1]]))
		}
		all_subunits = c(all_subunits, paste(these_subunits, collapse="|", sep=""))
		size_subunits = c(size_subunits, paste(this_size, collapse="|", sep=""))
	}
	table_nested_groups2 = cbind(table_nested_groups, all_subunits, size_subunits)

	output = list(table_nested_groups2, graph_nested_comp)
}

tmp = make_nested_groups(table_nested_complexList_final)	# TIME: 5'13''
table_nested_groups2 = tmp[[1]]
dim(table_nested_groups2)					# 1427x5
graph_nested_comp = tmp[[2]]
length(graph_nested_comp)					# 1427

hist(as.integer(table_nested_groups2[,3]),breaks=100)

# Plot and find motifs per nested group:
plot_nested_groups = function(nested_group_ID, table_nested_groups, graph_nested_comp, plotit="F", from="irefindex") {
	# complex-complex plot:
	if (plotit=="T") {
		if (from=="corum") {
			the_label = as.character(CORUM_2012[which(as.character(CORUM_2012[,1]) %in% V(graph_nested_comp[[nested_group_ID]])$name),2])
			postscript("fig2.eps", horizontal = F, fonts = "serif")
				plot(graph_nested_comp[[nested_group_ID]], vertex.label=the_label, vertex.size=40, vertex.label.color="black", vertex.label.cex=1.2)
			#jpeg(paste("nested_group_", nested_group_ID, "a.jpg", sep=""), width = 1024, height = 512)
				#plot(graph_nested_comp[[nested_group_ID]], vertex.label=the_label, vertex.size=24, main=paste("CORUM nested complexes, group", nested_group_ID, sep=" "))
			dev.off()
		} else {
			the_label = V(graph_nested_comp[[nested_group_ID]])$name
			jpeg(paste("nested_group_", nested_group_ID, "a.jpg", sep=""), width = 1024, height = 512)
				plot(graph_nested_comp[[nested_group_ID]], vertex.label=the_label, vertex.size=24, main=paste("iRefIndex nested complexes, group", nested_group_ID, sep=" "))
			dev.off()
		}
	}

	# prot-prot plot:
	if (from=="corum") {
		tmp = gsub("[|]", ",", table_nested_groups_CORUM[nested_group_ID, 4])
		tmp2 = unique(strsplit(tmp, ",", fixed=T)[[1]])
		list_proteins = paste("uniprotkb:", tmp2, sep="")
		g = convert_edgeList_to_graph(convert_MITAB_to_edgeList(iRef_C, node_names="uids", multi_edge="yes"))
		the_graph <- induced.subgraph(g, which(V(g)$name %in% list_proteins))
		V(the_graph)$name = gsub("uniprotkb:", "", V(the_graph)$name)
		jpeg(paste("nested_group_", nested_group_ID, "b.jpg", sep=""), width = 1024, height = 512)
			plot(the_graph, layout=layout.kamada.kawai, vertex.label=V(the_graph)$name, vertex.size=24, main=paste("CORUM nested complexes, group", nested_group_ID, sep=" "))
		dev.off()
	} else {
		list_complexes = strsplit(table_nested_groups[nested_group_ID, 2], ",", fixed=T)[[1]]
		this_edgeList_C = edgeList_C[which(edgeList_C[,3] %in% list_complexes),]
		if (is.vector(this_edgeList_C)==T) {this_edgeList_C = t(this_edgeList_C)}
		the_graph = convert_edgeList_to_graph(this_edgeList_C)
		V(the_graph)$name = gsub("uniprotkb:", "", V(the_graph)$name)
	}

	if (plotit=="T" & from=="irefindex") {
	jpeg(paste("nested_group_", nested_group_ID, "b.jpg", sep=""), width = 1024, height = 512)
	#plot_nest_group = function(the_graph) {
		list_colors_R = c("red", "blue", "green", "black", "orange", "yellow", "purple", "salmon", "grey", "pink", "brown", "violet", "navy", "khaki", "cyan", "magenta", "plum", "aquamarine", "chocolate", "steelblue", "tomato", "tan", "orchid", "ivory", "gold", "darkmagenta", "turquoise", "mistyrose", "skyblue", "palegreen", "palevioletred", "mintcream", "moccasin", "sienna", "maroon", "greenyellow", "darkred", "gainsboro", "coral", "bisque")
		n=0; list_edge_colors = NULL
		list_complexes = strsplit(table_nested_groups[nested_group_ID, 2], ",", fixed=T)[[1]]
		this_edgeList_C = edgeList_C[which(edgeList_C[,3] %in% list_complexes),]
		for (i in list_complexes) {
			n=n+1
			list_edge_colors[which(this_edgeList_C[,3] == i)] = list_colors_R[n]
		}
		the_graph <- set.edge.attribute(the_graph, "color", value=list_edge_colors)
	#}
		plot(the_graph, layout=layout.kamada.kawai, vertex.label=V(the_graph)$name, vertex.size=24, main=paste("iRefIndex nested complexes, group", nested_group_ID, sep=" "))
		legends = paste("complex_ID", list_complexes, sep="_")
		legend(.7, -1, legends, fill=list_colors_R[1:n])
	dev.off()
	}

	output = list(graph_nested_complexes = graph_nested_comp[[nested_group_ID]], graph_nested_prots = the_graph)
}

tmp = plot_nested_groups(421, table_nested_groups2, graph_nested_comp, "T")
tmp
tmp = plot_nested_groups(570, table_nested_groups2, graph_nested_comp, "T")
tmp

postscript("fig1.eps", horizontal=F, fonts="serif")
	par(mfrow=c(2,2))
	tmp = plot_nested_groups(421, table_nested_groups2, graph_nested_comp, "T")
	plot(tmp$graph_nested_complexes, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(tmp$graph_nested_complexes)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(a)")
	plot(tmp$graph_nested_prots, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(tmp$graph_nested_prots)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(b)")
	tmp = plot_nested_groups(570, table_nested_groups2, graph_nested_comp, "T")
	plot(tmp$graph_nested_complexes, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(tmp$graph_nested_complexes)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(c)")
	plot(tmp$graph_nested_prots, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(tmp$graph_nested_prots)$name), vertex.size=40, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(d)")
dev.off()

#######################
# 5. Stating the nesting problem:
#######################
# Procedence of complexes of a nested group:
table_procedence = NULL
for (i in 1:dim(table_nested_groups2)[1]) {
	tmp = strsplit(table_nested_groups2[i,2], ",", fixed=T)[[1]]
	pmids=NULL; methods=NULL; dbs=NULL
	for (j in tmp) {
		this_pmid = paste(unique(iRef_C[which(iRef_C$irigid == j), "pmids"]), collapse="|")
		this_method = paste(unique(iRef_C[which(iRef_C$irigid == j), "method"]), collapse="|")
		this_db = paste(unique(iRef_C[which(iRef_C$irigid == j), "sourcedb"]), collapse="|")
		pmids = c(pmids, this_pmid)
		methods = c(methods, this_method)
		dbs = c(dbs, this_db)
	}
	new_row = c(table_nested_groups2[i,1:2], pmids=paste(pmids, collapse=","), methods=paste(methods, collapse=","), dbs=paste(dbs, collapse=","), length(unique(pmids))==1, length(unique(methods))==1, length(unique(dbs))==1)
	table_procedence = rbind(table_procedence, new_row)
}
rownames(table_procedence) = NULL
dim(table_procedence)						# 1427x8  TIME: 16'

# Analysis of procedence:
grep("x", table_procedence[,4])			# X-ray cases in 190 nested groups; mostly AP
groups_same_procedence = table_procedence[which(table_procedence[,6]==TRUE & table_procedence[,7]==TRUE & table_procedence[,8]==TRUE),]  # 189 cases where group comes from same paper-db
unique(groups_same_procedence[,7])		# 53 methods in these groups of nested complexes from same study
tmp = table_procedence[grep("CORUM", table_procedence[,5]),]   # 685 cases with CORUM info
which(tmp[,8]==TRUE)				# 71 cases with ONLY corum info

# Examples of biological validation:
CORUM_2012 = read.csv("allComplexes.csv", header=T, comment.char="", sep=';', quote="")		# http://mips.helmholtz-muenchen.de/genre/proj/corum/allComplexes.csv

count_overlap = function(overlap_complex_complex, jaccard_complex_complex, meetmin_complex_complex) {
	# Quantify all significant in half the matrix:
	count_over = 0; count_meetmin = 0; all_pairs = 0; positive_overlap = NULL; positive_meetmin = NULL
	for (i in 1:(dim(overlap_complex_complex)[1]-1)) {
		for (j in (i+1):dim(overlap_complex_complex)[1]) {
			all_pairs = all_pairs + 1
			if (overlap_complex_complex[i,j] > 0) {
				positive_overlap = c(positive_overlap, overlap_complex_complex[i,j])
				count_over = count_over + 1
				positive_meetmin = c(positive_meetmin, meetmin_complex_complex[i,j])
				if (meetmin_complex_complex[i,j] == 1) {
					count_meetmin = count_meetmin + 1
				}
			}
		}
	}
	result = list(all_pairs=all_pairs, count_over=count_over, count_meetmin=count_meetmin)
}

list_complexes = as.character(CORUM_2012[,5])	# 2867
list_complexes[which(list_complexes=="")] <- 0.0
CORUM_matrices = generate_overlap_nesting_matrices(list_complexes)
CORUM_overlap_complex = CORUM_matrices$overlap
CORUM_jaccard_complex = CORUM_matrices$jaccard_index
CORUM_meetmin_complex = CORUM_matrices$meetmin_index

tmp = count_overlap(CORUM_overlap_complex, CORUM_jaccard_complex, CORUM_meetmin_complex)
CORUM_all_pairs = tmp$all_pairs
CORUM_all_pairs						# 4108411
CORUM_count_over = tmp$count_over
CORUM_count_over					# 25532
CORUM_count_meetmin = tmp$count_meetmin
CORUM_count_meetmin					# 3053

new_list_complexes = cbind(as.character(CORUM_2012[,1]), list_complexes, as.character(CORUM_2012[,2]), as.character(CORUM_2012[,10]), as.character(CORUM_2012[,11]))
table_nested_complexList_CORUM = create_table_nested_complexList(new_list_complexes, CORUM_overlap_complex, CORUM_jaccard_complex, CORUM_meetmin_complex)            # note: corum accepts repeated complexes, see this table

tmp = make_nested_groups(table_nested_complexList_CORUM)	# TIME: 2'
table_nested_groups_CORUM = tmp[[1]]
dim(table_nested_groups_CORUM)					# 750x5
graph_nested_comp_CORUM = tmp[[2]]
length(graph_nested_comp_CORUM)					# 750

# Adding CORUM annotation info:
add_annotation = function(table_nested_groups_CORUM, new_list_complexes) {
	all_names = NULL; all_functional_annotation = NULL; all_diseases = NULL
	for (i in 1:dim(table_nested_groups_CORUM)[1]) {
		this_complex = strsplit(table_nested_groups_CORUM[i,2], ",", fixed=T)[[1]]
		this_name=NULL; this_functional_annotation=NULL; this_disease=NULL
		for (j in this_complex) {
			tmp = new_list_complexes[which(new_list_complexes[,1]==j),]
			this_name = c(this_name, tmp[3])
			this_functional_annotation = c(this_functional_annotation, tmp[4])
			this_disease = c(this_disease, tmp[5])
		}
		all_names = c(all_names, paste(this_name, collapse="|", sep=""))
		all_functional_annotation = c(all_functional_annotation, paste(this_functional_annotation, collapse="|", sep=""))
		all_diseases = c(all_diseases, paste(this_disease, collapse="|", sep=""))
	}
	table_annotated_CORUM = cbind(table_nested_groups_CORUM, all_names, all_functional_annotation, all_diseases)
}

table_annotated_CORUM = add_annotation(table_nested_groups_CORUM, new_list_complexes)
dim(table_annotated_CORUM)					# 750x8

# Case studies: Nuclear pore complex, DNA repair, cancer and biggest nested group:
pore_nested_group = table_annotated_CORUM[grep("pore", table_annotated_CORUM[,6]),c(1,6)]
pore_nested_group
cancer_nested_groups = table_annotated_CORUM[unique(c(grep("cancer", table_annotated_CORUM[,7]), grep("cancer", table_annotated_CORUM[,8]))),c(1,5,6,7,8)]
dim(cancer_nested_groups)					# 32x5
repair_nested_groups = table_annotated_CORUM[unique(c(grep("repair", table_annotated_CORUM[,6]), grep("repair", table_annotated_CORUM[,7]))),c(1,5,6,7)]
dim(repair_nested_groups)					# 27x4
biggest_nested_group = table_annotated_CORUM[which.max(table_annotated_CORUM[,3]),c(1,3,6,7)]

plot_pore = plot_nested_groups(43, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_pore

plot_cancer1 = plot_nested_groups(150, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_cancer1
plot_cancer2 = plot_nested_groups(14, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_cancer2
plot_cancer3 = plot_nested_groups(15, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_cancer3
plot_cancer4 = plot_nested_groups(24, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_cancer4

plot_repair1 = plot_nested_groups(88, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_repair1
plot_repair2 = plot_nested_groups(118, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_repair2
plot_repair3 = plot_nested_groups(150, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_repair3
plot_repair4 = plot_nested_groups(159, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_repair4

plot_biggest = plot_nested_groups(30, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum")
plot_biggest

save.image("Project1.RData")

########################
# Part B
########################

#######################
# 0. Needed Libraries and Files:
#######################
load("Project1.RData")

#######################
# 1. Intuitive/naive criteria:
#######################
# 1.1. High Jaccard index is related to artifacts (mistakes):
# High Jaccard in Corum less than high Jaccard in all nested groups (suggest integration problem)
find_groups_two_complex_high_Jaccard = function(table_nested_groups2) {
	groups_two_complex = table_nested_groups2[which(table_nested_groups2[,3]==2),]
	jaccard = NULL
	for (i in 1:dim(groups_two_complex)[1]) {
		tmp = strsplit(groups_two_complex[i,4], "|", fixed=T)[[1]]
		vector1 = strsplit(tmp[1], ",", fixed=T)[[1]]
		vector2 = strsplit(tmp[2], ",", fixed=T)[[1]]
		jaccard = c(jaccard, length(intersect(vector1, vector2)) / length(unique(c(vector1, vector2))))
	}
	groups_two_complex = data.frame(groups_two_complex, as.numeric(jaccard))
}

groups_two_complex = find_groups_two_complex_high_Jaccard(table_nested_groups2)
length(which(groups_two_complex[,6] > 0.75))		# 55 cases of only one subcomplex and high Jaccard
groups_two_complex_CORUM = find_groups_two_complex_high_Jaccard(table_nested_groups_CORUM)
length(which(groups_two_complex_CORUM[,6] > 0.75))	# 51 cases of only one subcomplex and high Jaccard
high_Jaccard_CORUM = groups_two_complex_CORUM[which(groups_two_complex_CORUM[,6] > 0.75),]
# Almost all cases are in CORUM, disproving hypothesis.

# High Jaccard come from two different PMIDs or databases (suggest integration problem)
x = NULL; y = NULL; z = NULL
for (i in 50:100) {
	j = i/100
	high_Jaccard = groups_two_complex[which(groups_two_complex[,6] > j),]
	if (is.vector(high_Jaccard)==F) {
		high_Jaccard_diff_procedence = high_Jaccard[which(high_Jaccard[,1] %in% table_procedence[which(table_procedence[,6]==F | table_procedence[,8]==F), 1]),]
		size_Jaccard = dim(high_Jaccard)[1]
		size_diff_procedence = dim(high_Jaccard_diff_procedence)[1]
	} else {
		if (as.character(high_Jaccard[1]) %in% table_procedence[which(table_procedence[,6]==F | table_procedence[,8]==F), 1]) {	size_diff_procedence = 1 } else { size_diff_procedence = 0 }
		if (length(high_Jaccard)>0) { size_Jaccard = 1 } else { size_Jaccard = 0 }
	}
	x = c(x, j)
	y = c(y, size_diff_procedence)
	z = c(z, size_Jaccard)
}
jpeg("Jaccard_analysis.jpg", width = 1024, height = 512)
	plot(x, y, xlab="Jaccard index", ylab="Number of size-2 nested groups", col="red")
	lines(x, z, col="blue")
dev.off()
postscript("fig5.eps", horizontal = F, fonts = "serif")
	plot(x, y, xlab="Jaccard index", ylab="Number of size-2 nested groups", col="red")
	lines(x, z, col="blue")
dev.off()

# High number of subcomplexes (possibly overlapping) is related to real subcomplexes:
# High subcomplexes in gold standard (CORUM) compared to all nested groups
groups_multiple_complex = table_nested_groups2[which(table_nested_groups2[,3]>3),]
groups_multiple_complex_CORUM = table_nested_groups_CORUM[which(table_nested_groups_CORUM[,3]>3),]
population = dim(table_nested_groups2)[1]				# 1427
success_population = dim(groups_multiple_complex)[1]			# 398
sample = dim(table_nested_groups_CORUM)[1]				# 750
success_sample = dim(groups_multiple_complex_CORUM)[1]			# 170
# 170 out of 750 (22.7%) vs 398 out of 1427 (27.9%) (CORUM is less enriched in multiple)
pvalue_numbers = phyper(length(success_sample)-1, length(success_population), length(population) - length(success_population), length(sample), lower.tail=FALSE)			# 1 (not-significant)
# High number of subcomplexes is not a feature of gold standard and, therefore, may not be related to real subcomplexes.

# High number subcomplexes come from same PMID??
high_number_diff_procedence = groups_multiple_complex[which(groups_multiple_complex[,1] %in% table_procedence[which(table_procedence[,6]==F | table_procedence[,8]==F), 1]),]		# 374 out of 398
# High number subcomplexes come from diff PMID. This is argument pro-mistake, not pro-real nest.

#######################
# 1. Analysis of degree, centralities, clustering coefficient and cancer-related status, in nested groups:
#######################
# Compute BC of the whole metwork. Then, per pair maximal complex-subcomplex, find BC of the shared and non-shared proteins inside the whole network. Make statistics tests to see if nested or non-nested proteins happen to have different BC. I will get a conclusion per maximal-subcomplex pair, and then I might make a general distribution to conclude if it is really relevant.

edgeList_PIN = convert_MITAB_to_edgeList(irefindex_curr_hh, canonical_rep="no", node_names="uids", multi_edge="yes")
dim(edgeList_PIN)							# 162254x5
# Note: Complexes with one only protein were not removed.

PIN_full = convert_edgeList_to_graph(edgeList_PIN)
PIN_full								# 19482 nodes, 162254 edges

network_analysis_nested_group = function(PIN_full, table_nested_complexList_final, cancer_related_prots) {
	bc_full = betweenness(PIN_full); pr_full = page.rank(PIN_full)$vector
	deg_full = igraph::degree(PIN_full)
	pvalue_bc = NULL; pvalue_pr = NULL; pvalue_deg = NULL; pvalue_cc = NULL; hg_score_cancer = NULL
	for (i in 1:dim(table_nested_complexList_final)[1]) {
		vector_complex = strsplit(as.character(table_nested_complexList_final[i,3]), ",", fixed=T)[[1]]
		vector_subcomplex = strsplit(as.character(table_nested_complexList_final[i,4]), ",", fixed=T)[[1]]
		shared_prots = intersect(vector_complex, vector_subcomplex)
		non_shared_prots = setdiff(vector_complex, vector_subcomplex)

		vector_degree_shared = deg_full[which(names(deg_full) %in% shared_prots)]
		vector_degree_non_shared = deg_full[which(names(deg_full) %in% non_shared_prots)]
		wtest_deg = wilcox.test(vector_degree_shared, vector_degree_non_shared)
		pvalue_deg = c(pvalue_deg, wtest_deg$p.value)			# (<0.05 different, >0.05 the same)

		vector_betweenness_shared = bc_full[which(names(bc_full) %in% shared_prots)]
		vector_betweenness_non_shared = bc_full[which(names(bc_full) %in% non_shared_prots)]
		wtest_bc = wilcox.test(vector_betweenness_shared, vector_betweenness_non_shared)
		pvalue_bc = c(pvalue_bc, wtest_bc$p.value)			# (<0.05 different, >0.05 the same)

		vector_prank_shared = pr_full[which(names(pr_full) %in% shared_prots)]
		vector_prank_non_shared = pr_full[which(names(pr_full) %in% non_shared_prots)]
		wtest_pr = wilcox.test(vector_prank_shared, vector_prank_non_shared)
		pvalue_pr = c(pvalue_pr, wtest_pr$p.value)			# (<0.05 different, >0.05 the same)

		vector_cc_shared = transitivity(PIN_full, type="local", vids=which(V(PIN_full)$name %in% shared_prots))
		vector_cc_non_shared = transitivity(PIN_full, type="local", vids=which(V(PIN_full)$name %in% non_shared_prots))
		index_nan_shared = which(vector_cc_shared=="NaN")
		index_nan_non_shared = which(vector_cc_non_shared=="NaN")
		vector_cc_shared[index_nan_shared] = rep(0, length(index_nan_shared))
		vector_cc_non_shared[index_nan_non_shared] = rep(0, length(index_nan_non_shared))
		wtest_cc = wilcox.test(vector_cc_shared, vector_cc_non_shared)
		pvalue_cc = c(pvalue_cc, wtest_cc$p.value)			# (<0.05 different, >0.05 the same)

		all_prots = unique(c(shared_prots, non_shared_prots))
		cancer_shared_prots = shared_prots %in% cancer_related_prots
		cancer_all_prots = all_prots %in% cancer_related_prots
		#total_number_prots = population = N
		#cancer_all_prots = success_population = m
		#shared_prots = sample = n
		#cancer_shared_prots = success_sample = k
		hg_score_cancer = c(hg_score_cancer, phyper(length(cancer_shared_prots)-1, length(cancer_all_prots), length(all_prots) - length(cancer_all_prots), length(shared_prots), lower.tail=FALSE))
	}
	table_analysis_nested_groups = cbind(table_nested_complexList_final, pvalue_deg, pvalue_bc, pvalue_pr, pvalue_cc, hg_score_cancer)
}

table_analysis_nested_groups = network_analysis_nested_group(PIN_full, table_nested_complexList_final, cancer_related_prots)

#hist(table_analysis_nested_groups[,5])
#hist(table_analysis_nested_groups[,6])
jpeg("histogram_P2_1.jpg", width = 1024, height = 512)
	hist(table_analysis_nested_groups[,8], breaks=100, xlab="p-value", main="p-values Degree")
dev.off()
jpeg("histogram_P2_2.jpg", width = 1024, height = 512)
	hist(table_analysis_nested_groups[,9], breaks=100, xlab="p-value", main="p-values BC")
dev.off()
jpeg("histogram_P2_3.jpg", width = 1024, height = 512)
	hist(table_analysis_nested_groups[,10], breaks=100, xlab="p-value", main="p-values PR centrality")
dev.off()
jpeg("histogram_P2_4.jpg", width = 1024, height = 512)
	hist(table_analysis_nested_groups[,11], breaks=100, xlab="p-value", main="p-values Clust.Coeff.")
dev.off()
postscript("fig6.eps", horizontal=F, fonts="serif")
	par(mfrow=c(2,2))
	hist(table_analysis_nested_groups[,8], breaks=100, xlab="p-value -degree", main="(a)")
	hist(table_analysis_nested_groups[,9], breaks=100, xlab="p-value -BC", main="(b)")
	hist(table_analysis_nested_groups[,10], breaks=100, xlab="p-value -PRC", main="(c)")
	hist(table_analysis_nested_groups[,11], breaks=100, xlab="p-value -CC", main="(d)")
dev.off()

#######################
# 2. Adding motif information:
#######################
# Adding motif (NeMo) information:
# 2-vertex motif:
trivial_motifs = table_nested_groups2[which(table_nested_groups2[,3]=="2"),]
dim(trivial_motifs)					# 669x5

# Select 3,4 and 5-vertex cases to be analyzed by NeMo:
table_nested_groups_motif = table_nested_groups2[-which(table_nested_groups2[,3]=="2" | as.integer(table_nested_groups2[,3])>5),]
unique(table_nested_groups_motif[,3])
dim(table_nested_groups_motif)				# 510x5

# Find motifs:
add_motifs = function(table_nested_groups_motif, edgeList_C, plot_type="complex") {
	library(NeMo)

	table_motifs = NULL
	for (i in 1:dim(table_nested_groups_motif)[1]) {
		list_complexes = strsplit(table_nested_groups_motif[i, 2], ",", fixed=T)[[1]]
		this_edgeList_C = edgeList_C[which(edgeList_C[,3] %in% list_complexes),]
		if (dim(this_edgeList_C)[1]>0) {
			file_name = paste("edgeList", table_nested_groups_motif[i, 1], "txt", sep=".")
			write.table(this_edgeList_C[,1:2], file=file_name, sep=" ", row.names=F)

		# Compute the p-value of all motifs (by default EDD model)
			for (j in 3:5) {
				pv <- getExceptMotif(file_name, j)

				count_found_motifs = pv$counts[pv$counts > 0]
				index_found_motifs = which(pv$counts > 0)
				index_significant_motifs = which(pv$P.value < 0.05)
				count_significant_motifs = length(index_significant_motifs)
				n=0; all_motifs=NULL
				for (k in index_found_motifs) {
					n = n+1
					motif_matrix = pv$motifs[k][[1]]
					motif_index = which(motif_matrix==1, arr.ind=T)
					the_motif = paste("motif_", n, " = ", paste(paste(motif_index[,1], motif_index[,2], sep=","), collapse="|"), sep="")    # remove duplicated
					all_motifs = c(all_motifs, the_motif)
					tmp = cbind(NID=table_nested_groups_motif[i,1], motif_size=j, all_motifs, count_found_motifs, count_significant_motifs)
					table_motifs = rbind(table_motifs, tmp)
					#graph_found_motif <- graph.adjacency(pv$motifs[[k]], mode="directed")
				}
			}
		}
	}
	rownames(table_motifs)=NULL
	table_motifs
}

table_motifs = add_motifs(table_nested_groups_motif, edgeList_C)   # WARNINGS: Problems with Polya parameters.
dim(table_motifs)				# 64027x5
table_motifs_significant0 = table_motifs[which(as.integer(table_motifs[,5]) > 0),]
dim(table_motifs_significant0)			# 46088 (72%)
#table_motifs_significant = table_motifs[which(as.integer(table_motifs[,5]) > 15),]
#dim(table_motifs_significant)			# 18806x5 (29.4%)

jpeg("histogram_P2_5.jpg", width = 1024, height = 512)
	hist(as.integer(table_motifs[,5]), breaks=15)	# slightly bimodal around 0 and 20
dev.off()
hist(as.integer(table_motifs[,2]))		# 5-motifs, then 4-motifs, then 3-motifs

hist(as.integer(table_motifs_significant[,5]), breaks=6)
table(table_motifs_significant[,2])		# all common enriched are 5-motifs

extract_table_motifs = function(table_motifs) {
	the_table = table(table_motifs[,3])
	the_names = names(the_table)
	edited_names = gsub("motif_[0-9]+ = ", "", the_names)
	new_edited_names = NULL
	for (i in edited_names) {
		tmp = strsplit(i, "|", fixed=T)[[1]]
		new_tmp = NULL
		for (j in tmp) {
			tmp2 = sort(strsplit(j, ",", fixed=T)[[1]])
			new_tmp = c(new_tmp, paste(tmp2, collapse=","))
		}
		new_edited_names = c(new_edited_names, paste(unique(new_tmp), collapse="|"))
	}
	names(the_table) = new_edited_names
	sort(the_table, decreasing=T)
}
extract_table_motifs(table_motifs)			# imperfect 5-cliques are the most common motif
extract_table_motifs(table_motifs_significant0)		# imperfect 5-cliques are the most common motif; examples:
# 1,2|1,3|1,4|1,5|2,3|2,4|3,4|3,5|4,5|	imperfect clique (missing edge 2,5):	2320
# 1,2|1,3|1,4|1,5|2,3|2,4|3,4|3,5|	imperfect clique (missing edges 2,5 and 4,5):  980
# 1,2|1,3|1,4|1,5|2,3|2,4|2,5|3,4|3,5|4,5|	full clique: 348 
# This is evidence that nesting can be related to the concept of co-graphs, such as overlaps.
# 669+510=1179 studied cases (82.6% of all cases; the top 17.4% was not studied).

# Checking some over-represented motifs:
table_motifs[which(table_motifs[,1]=="NID_22"),]
tmp = plot_nested_groups(22, table_nested_groups_motif, graph_nested_comp, "T")
tmp
plot(tmp$graph_nested_complexes)
plot(tmp$graph_nested_prots)
table_motifs[which(table_motifs[,1]=="NID_22"),]

######################
# 3. Modularity measures:
######################
create_graph_lists = function(table_nested_groups2, graph_nested_comp, PIN_full) {
	vector_list_prots = NULL; tmp1 = list(); tmp2 = list()
	for (i in 1:dim(table_nested_groups2)[1]) {
		tmp = plot_nested_groups(i, table_nested_groups2, graph_nested_comp, "F")
		tmp1[[i]] = tmp$graph_nested_prots
		list_prots = strsplit(table_nested_groups2[i,4], "|", fixed=T)[[1]][1]
		vector_list_prots = c(vector_list_prots, list_prots)
		ids = which(gsub("uniprotkb:", "", V(PIN_full)$name) %in% strsplit(list_prots, ",", fixed=T)[[1]])
		tmp2[[i]] = induced.subgraph(PIN_full, ids)
	}
	output = list(tmp1, tmp2, vector_list_prots)
}

tmp = create_graph_lists(table_nested_groups2, graph_nested_comp, PIN_full)
tmp1 = tmp[[1]]
tmp2 = tmp[[2]]
vector_list_prots = tmp[[3]]

modularity_analysis = function(table_nested_groups2, tmp1, tmp2) {
	vector_eb_mod_C = NULL; vector_walk_mod_C = NULL; vector_label_mod_C = NULL; vector_eigen_mod_C = NULL
	vector_eb_number_C = NULL; vector_walk_number_C = NULL; vector_label_number_C = NULL; vector_eigen_number_C = NULL
	vector_eb_mod_all = NULL; vector_walk_mod_all = NULL; vector_label_mod_all = NULL; vector_eigen_mod_all = NULL
	vector_eb_number_all = NULL; vector_walk_number_all = NULL; vector_label_number_all = NULL; vector_eigen_number_all = NULL
	for (i in 1:dim(table_nested_groups2)[1]) {
		if (length(V(tmp1[[i]]))>0) {
			eb_community = edge.betweenness.community(tmp1[[i]])
			vector_eb_number_C = c(vector_eb_number_C, max(eb_community$membership))
			vector_eb_mod_C = c(vector_eb_mod_C, max(eb_community$modularity))

			walk_community = walktrap.community(tmp1[[i]])
			vector_walk_number_C = c(vector_walk_number_C, max(walk_community$membership))
			vector_walk_mod_C = c(vector_walk_mod_C, max(walk_community$modularity))

			label_community = label.propagation.community(tmp1[[i]])
			vector_label_number_C = c(vector_label_number_C, max(label_community$membership))
			vector_label_mod_C = c(vector_label_mod_C, max(label_community$modularity))

#			eigen_community = tryCatch(leading.eigenvector.community(tmp1[[i]]), error=function(e) NULL )
#			vector_eigen_number_C = c(vector_eigen_number_C, max(eigen_community$membership))
#			vector_eigen_mod_C = c(vector_eigen_mod_C, max(eigen_community$modularity))

			eb_community = edge.betweenness.community(tmp2[[i]])
			vector_eb_number_all = c(vector_eb_number_all, max(eb_community$membership))
			vector_eb_mod_all = c(vector_eb_mod_all, max(eb_community$modularity))

			walk_community = walktrap.community(tmp2[[i]])
			vector_walk_number_all = c(vector_walk_number_all, max(walk_community$membership))
			vector_walk_mod_all = c(vector_walk_mod_all, max(walk_community$modularity))

			label_community = label.propagation.community(tmp2[[i]])
			vector_label_number_all = c(vector_label_number_all, max(label_community$membership))
			vector_label_mod_all = c(vector_label_mod_all, max(label_community$modularity))

#			eigen_community = leading.eigenvector.community(tmp2[[i]])
#			vector_eigen_number_all = c(vector_eigen_number_all, max(eigen_community$membership))
#			vector_eigen_mod_all = c(vector_eigen_mod_all, max(eigen_community$modularity))
		} else {
			vector_eb_number_C = c(vector_eb_number_C, "NA")
			vector_eb_mod_C = c(vector_eb_mod_C, "NA")
			vector_walk_number_C = c(vector_walk_number_C, "NA")
			vector_walk_mod_C = c(vector_walk_mod_C, "NA")
			vector_label_number_C = c(vector_label_number_C, "NA")
			vector_label_mod_C = c(vector_label_mod_C, "NA")

			vector_eb_number_all = c(vector_eb_number_all, "NA")
			vector_eb_mod_all = c(vector_eb_mod_all, "NA")
			vector_walk_number_all = c(vector_walk_number_all, "NA")
			vector_walk_mod_all = c(vector_walk_mod_all, "NA")
			vector_label_number_all = c(vector_label_number_all, "NA")
			vector_label_mod_all = c(vector_label_mod_all, "NA")
		}
	}

	table_modularity = cbind(table_nested_groups2[,1], vector_list_prots, vector_eb_number_C, vector_eb_number_all, vector_eb_mod_C, vector_eb_mod_all, vector_walk_number_C, vector_walk_number_all, vector_walk_mod_C, vector_walk_mod_all, vector_label_number_C, vector_label_number_all, vector_label_mod_C, vector_label_mod_all)
	colnames(table_modularity) = c("Group_ID", "Proteins", "#Comms -edge_betw_C", "#Comms -edge_betw_all", "Modul -edge_betw_C", "Modul -edge_betw_all", "#Comms -walk_C", "#Comms -walk_all", "Modul -walk_C", "Modul -walk_all", "#Comms -label_C", "#Comms -label_all", "Modul -label_C", "Modul -label_all")

	table_modularity
}							# TIME: 4'

table_modularity = modularity_analysis(table_nested_groups2, tmp1, tmp2)  # all give results, not C
table_modularity_all = table_modularity[,c(1,2,4,6,8,10,12,14)]

jpeg("histogram_P2_6.jpg", width = 1024, height = 512)
	hist(as.numeric(table_modularity_all[,4]), main="Mod -edge betweenness method", xlab="Modularity", breaks=100)
dev.off()
jpeg("histogram_P2_7.jpg", width = 1024, height = 512)
	hist(as.numeric(table_modularity_all[,6]), main="Mod -walk trap method", xlab="Modularity", breaks=100)
dev.off()
jpeg("histogram_P2_8.jpg", width = 1024, height = 512)
	hist(as.numeric(table_modularity_all[,8]), main="Mod -label propagation method", xlab="Modularity", breaks=100)
dev.off()

# Overlap between communities and subcomplexes:
tmp = table_modularity_all[which(table_modularity_all[,4] != "NA"),]
eb_group = tmp[which(as.numeric(tmp[,4]) > 0.5),]
tmp = table_modularity_all[which(table_modularity_all[,6] != "NA"),]
walk_group = tmp[which(as.numeric(tmp[,6]) > 0.5),]
tmp = table_modularity_all[which(table_modularity_all[,8] != "NA"),]
label_group = tmp[which(as.numeric(tmp[,8]) > 0.5),]	# 3 cases of good modularity: Nested groups 291, 771, 876.
# They fail to detect modules.

overlap_community_subcomplex = function(tmp2, group_id) {
	list_a = list(); list_a2 = list()
	a = membership(label.propagation.community(tmp2[[group_id]]))
	# table(a)
	for (i in 1:max(a)) {
		tmp = a[a==i]
		list_a[[i]] = paste(sort(gsub("uniprotkb:","",names(tmp))), collapse=",", sep="")
	}
	list_a2 = as.list(strsplit(table_nested_groups2[group_id,4], "|", fixed=T)[[1]])
	list_a2 = list_a2[-1]
	coincidences = NULL
	for (i in 1:length(list_a2)) {
		for (j in 1:length(list_a)) {
			#jaccard = intersect() / unique(c())
			if (list_a2[[i]]==list_a[[j]]) {coincidences = c(coincidences, list_a2[i])}
		}
	}
	TP = length(coincidences)
	FP = length(list_a)-length(coincidences)
	FN = length(list_a2)-length(coincidences)
	output = list(communities=length(list_a), subcomplexes=length(list_a2), coincidences=coincidences, TP=TP, FP=FP, FN=FN, PRECISION=TP/(TP+FP), FDR=FP/(FP+TP))
}


table_overlap_community_subcomplex = NULL
for (i in 1:length(tmp2)) {
	example = overlap_community_subcomplex(tmp2, i)
	if (length(example)>0) {
		new_row = c(table_nested_groups2[i,1], example$communities, example$subcomplexes, example$TP, example$PRECISION, example$FDR)
	} else {
		new_row = c(table_nested_groups2[i,1], NA, NA, NA, NA, NA)
	}
	table_overlap_community_subcomplex = rbind(table_overlap_community_subcomplex, new_row)
}
rownames(table_overlap_community_subcomplex) = NULL
colnames(table_overlap_community_subcomplex) = c("nested_group", "communities", "subcomplexes", "TP", "Precision", "FDR")

jpeg("histogram_P2_9.jpg", width = 1024, height = 512)
	hist(as.numeric(table_overlap_community_subcomplex[,5]), main="Precision -label propagation method", xlab="Precision", breaks=100)
dev.off()
x11(); hist(as.numeric(table_overlap_community_subcomplex[,6]), breaks=100)
# Conclusion: Very few cases of good prediction (precision and FDR).

tmp = table_modularity_all[which(table_modularity_all[,4] != "NA"),]
table_overlap_community_subcomplex_mod1 = table_overlap_community_subcomplex[which(as.numeric(tmp[,4]) > 0.5),]
table_overlap_community_subcomplex_mod2 = table_overlap_community_subcomplex[which(as.numeric(tmp[,3]) > 4 & as.numeric(tmp[,4]) > 0.4),]
table_overlap_community_subcomplex_mod3 = table_overlap_community_subcomplex[which(as.numeric(tmp[,3]) > 12 & as.numeric(tmp[,4]) > 0.4),]
table_overlap_community_subcomplex_mod4 = table_overlap_community_subcomplex[which(as.numeric(tmp[,4]) > 0.2 & as.numeric(tmp[,4]) < 0.3),]

library(plotrix)
multhist(list(as.numeric(table_overlap_community_subcomplex_mod2[,5]), as.numeric(table_overlap_community_subcomplex_mod3[,5]), as.numeric(table_overlap_community_subcomplex_mod4[,5])), beside=T)
# In all cases (more modular or not), precision is always more bad than good. Conclusion: Not related to quality or quantity of modules. Just uniformly bad.

# Some examples:
ex1 = overlap_community_subcomplex(tmp2, 291)		# 1 out of 3 subcomplexes detected; additional 9 over-predicted
ex2 = overlap_community_subcomplex(tmp2, 771)		# 0
ex3 = overlap_community_subcomplex(tmp2, 876)		# 1

tmp = table_modularity_all[which(table_modularity_all[,4] != "NA"),]
eb_group = tmp[which(as.numeric(tmp[,3]) > 4 & as.numeric(tmp[,4]) > 0.4),]	# 103,291,876,959,1129,1387
tmp = table_modularity_all[which(table_modularity_all[,6] != "NA"),]
walk_group = tmp[which(as.numeric(tmp[,5]) > 4 & as.numeric(tmp[,6]) > 0.4),]	# 291,560,650,771,789,1116,1129,1403
tmp = table_modularity_all[which(table_modularity_all[,8] != "NA"),]
label_group = tmp[which(as.numeric(tmp[,7]) > 4 & as.numeric(tmp[,8]) > 0.4),]	# 291,560,650,771,1116,1129,1249,1403

ex4 = overlap_community_subcomplex(tmp2, 103)		# 1
ex5 = overlap_community_subcomplex(tmp2, 560)		# 1
ex6 = overlap_community_subcomplex(tmp2, 650)		# 0
ex7 = overlap_community_subcomplex(tmp2, 789)		# 1
ex8 = overlap_community_subcomplex(tmp2, 959)		# 0
ex9 = overlap_community_subcomplex(tmp2, 1116)		# 0
ex10 = overlap_community_subcomplex(tmp2, 1129)		# 0
ex11 = overlap_community_subcomplex(tmp2, 1249)		# 1
ex12 = overlap_community_subcomplex(tmp2, 1387)		# 1
ex13 = overlap_community_subcomplex(tmp2, 1403)		# 1

######################
# 4. GO analysis:
######################
# Use GO comparisons such as co-localization:

library(org.Hs.eg.db)
library(GO.db)
library(annotate)
load("eg2uid.RData")
x <- org.Hs.egGO
mapped_genes <- mappedkeys(x)	# Get the entrez gene identifiers that are mapped to a GO ID
xx <- as.list(x[mapped_genes])	# Convert to a list

all_shared_prots = unique(strsplit(paste(table_nested_complexList_final[,4], collapse=",", sep=""), ",", fixed=T)[[1]])
all_non_shared_prots = setdiff(V(PIN_full)$name, all_shared_prots)
length_shared = length(all_shared_prots)				# 2239
length_non_shared = length(all_non_shared_prots)			# 17243

GO_enrichment_uids = function(uids, toplength=20) {
	geneids = extended_table[which(extended_table[,2] %in% uids), 1]

	if (length(geneids)>0) {
		GO_MF = NULL; GO_BP = NULL; GO_CC = NULL
		for (i in geneids) {
			GO_MF = c(GO_MF, getOntology(xx[[i]], "MF"))
			GO_BP = c(GO_BP, getOntology(xx[[i]], "BP"))
			GO_CC = c(GO_CC, getOntology(xx[[i]], "CC"))
		}
		if (length(GO_MF)>0) {
			All_GO_MF_Terms = as.matrix(Term(unique(GO_MF)))
			Top_GO_MF_Terms = sort(table(GO_MF), decreasing=T) #[1:toplength]
			names_MF = Term(names(Top_GO_MF_Terms))
			rownames(Top_GO_MF_Terms)=NULL
			GO_enrichment_MF = cbind(names_MF, Top_GO_MF_Terms)
		} else {
			GO_enrichment_MF = "No ontology found for geneID"
		}
		if (length(GO_BP)>0) {
			All_GO_BP_Terms = as.matrix(Term(unique(GO_BP)))
			Top_GO_BP_Terms = sort(table(GO_BP), decreasing=T) #[1:toplength]
			names_BP = Term(names(Top_GO_BP_Terms))
			rownames(Top_GO_BP_Terms)=NULL
			GO_enrichment_BP = cbind(names_BP, Top_GO_BP_Terms)
		} else {
			GO_enrichment_BP = "No ontology found for geneID"
		}
		if (length(GO_CC)>0) {
			All_GO_CC_Terms = as.matrix(Term(unique(GO_CC)))
			Top_GO_CC_Terms = sort(table(GO_CC), decreasing=T) #[1:toplength]
			names_CC = Term(names(Top_GO_CC_Terms))
			rownames(Top_GO_CC_Terms)=NULL
			GO_enrichment_CC = cbind(names_CC, Top_GO_CC_Terms)
		} else {
			GO_enrichment_CC = "No ontology found for geneID"
		}
	} else {
		GO_enrichment_MF = "GeneID not found"
		GO_enrichment_BP = "GeneID not found"
		GO_enrichment_CC = "GeneID not found"
	}

	result = list(GO_enrichment_BP, GO_enrichment_MF, GO_enrichment_CC)
}

tmp = GO_enrichment_uids(all_shared_prots)
BP_enrichment_shared = cbind(tmp[[1]][,1], tmp[[1]][,2])
MF_enrichment_shared = cbind(tmp[[2]][,1], tmp[[2]][,2])
CC_enrichment_shared = cbind(tmp[[3]][,1], tmp[[3]][,2])

BP_enrichment_shared[1:20,]		# Indeed, enriched in regulatory terms.
length(grep("regulat", BP_enrichment_shared[1:20,]))/20		# 25% of top 20 is related to regulation

BP_sum = 0; MF_sum = 0; CC_sum = 0
for (i in 1:100) {
	tmp = GO_enrichment_uids(sample(all_non_shared_prots, length_shared, replace=F))
	BP_enrichment_non_shared = cbind(tmp[[1]][,1], tmp[[1]][,2])
	MF_enrichment_non_shared = cbind(tmp[[2]][,1], tmp[[2]][,2])
	CC_enrichment_non_shared = cbind(tmp[[3]][,1], tmp[[3]][,2])

	BP_sum = BP_sum + dim(BP_enrichment_non_shared)[1]
	MF_sum = MF_sum + dim(MF_enrichment_non_shared)[1]
	CC_sum = CC_sum + dim(CC_enrichment_non_shared)[1]
}
BP_enrichment_non_shared = BP_sum / 100
MF_enrichment_non_shared = MF_sum / 100
CC_enrichment_non_shared = CC_sum / 100

GO_enrichment_analysis = rbind(c(dim(BP_enrichment_shared)[1], BP_enrichment_non_shared), c(dim(MF_enrichment_shared)[1], MF_enrichment_non_shared), c(dim(CC_enrichment_shared)[1], CC_enrichment_non_shared))
colnames(GO_enrichment_analysis) = c("Shared prots", "Non-shared prots")
rownames(GO_enrichment_analysis) = c("BP", "MF", "CC")
GO_enrichment_analysis
# Conclusion: Slightly more terms in shared BP and CC.