blob: a4bd35dd2b2beddaa076af0e00d6dcb332937e15 [file] [log] [blame]
import json
import sys, getopt
from collections import OrderedDict
dict = {}
dupliacteUid = {}
#debugFlag = True
debugFlag = False
def join_strings(lst):
concat = ""
for string in lst:
if (string != None):
if (type(string) == int):
string = str(string)
concat += (string + " ")
return concat
def debug(desc, *args):
'print only if debug enabled'
if (debugFlag == True):
print desc, join_strings(args)
def log(desc, arg):
'print log info'
print desc, arg
def getUid(vertex):
uid = None
nodeLabel=vertex.get('nodeLabel')
debug(nodeLabel)
if ( nodeLabel == 'user' ):
uid = vertex['userId']
elif ( nodeLabel == 'tag' ):
uid = vertex['name']
elif ( nodeLabel == None ):
pass
elif ( nodeLabel == 'lockNode' ):
uid = vertex.get('uid')
else: uid = vertex['uid']
debug(nodeLabel, uid)
return uid
def generateFile(inputFile, outputFile):
with open(inputFile) as json_file:
dupliacteUid = {}
json_data = json.load(json_file)
for x in json_data['vertices']:
uid = getUid(x)
existId = dict.get(uid)
if (existId == None):
dict[uid] = x.get('_id')
else:
dupliacteUid[uid] = existId
log("duplicate ids", dupliacteUid)
json_data_vertices = json_data['vertices']
log("number of vertices is", len(json_data_vertices))
ids = {}
deleteIndexes = []
for i in xrange(len(json_data_vertices)):
#print "****** ", i, " *************"
#print json_data_vertices[i]
id = json_data_vertices[i]["_id"]
uid = getUid(json_data_vertices[i])
isDuplicateId = dupliacteUid.get(uid)
if (isDuplicateId != None):
debug("uid to id pair", uid if uid != None else 'None', id)
value = ids.get(uid)
if (value == None):
list = [id,]
ids[uid] = list
else:
value.append(id)
deleteIndexes.append(id)
log("ids", ids)
log("deleteIndexes", deleteIndexes)
log("deleteIndexes size", len(deleteIndexes))
filter_vertex = [ x for x in json_data_vertices if x.get('_id') not in deleteIndexes ]
json_data['vertices'] = filter_vertex
log("number of vertexes after filter", len(filter_vertex))
json_data_edges = json_data['edges']
log("number of edges", len(json_data_edges))
filter_edge = [ x for x in json_data_edges if x['_outV'] not in (deleteIndexes) and x['_inV'] not in (deleteIndexes) ]
json_data['edges'] = filter_edge
log("number of edges after filter", len(json_data['edges']))
json_data = OrderedDict(sorted(json_data.items(), key=lambda t: t[0], reverse=True))
with open(outputFile, 'w') as outfile:
#json.dump(json_data, outfile)
json.dump(json_data, outfile)
log("output file is", outputFile);
def main(argv):
print 'Number of arguments:', len(sys.argv), 'arguments.'
inputfile = None
outputfile = ''
try:
opts, args = getopt.getopt(argv,"h:i:o:",["ifile=","ofile="])
except getopt.GetoptError:
print sys.argv[0], '-i <inputfile>'
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print sys.argv[0], '-i <inputfile>'
sys.exit(3)
elif opt in ("-i", "--ifile"):
inputfile = arg
if ( inputfile == None ):
print sys.argv[0] ,'-i <inputfile>'
sys.exit(3)
print 'Input file is "', inputfile
generateFile(inputfile, inputfile + '.noduplicates')
if __name__ == "__main__":
main(sys.argv[1:])
# print x['uid']