Michael Lando | 451a340 | 2017-02-19 10:28:42 +0200 | [diff] [blame] | 1 | import json |
| 2 | import sys, getopt |
| 3 | from collections import OrderedDict |
| 4 | |
# Debug switch: set to True to make debug() print its messages.
debugFlag = False

# Module-level lookup table keyed by vertex uid.
# NOTE: this name shadows the builtin ``dict`` type for the whole module.
dict = {}

# Module-level table reserved for duplicated uids.
dupliacteUid = {}
| 9 | |
def join_strings(lst):
    """Join the non-None items of *lst* into one space-separated string.

    Every emitted item is followed by a single space, so a non-empty result
    always ends with a trailing space (callers rely on this output format).

    Args:
        lst: Iterable of values. ``None`` entries are skipped. Strings are
            used as they are; any other value (int, float, bool, list, ...)
            is converted with ``str()``.

    Returns:
        The concatenated string, or ``""`` when nothing was emitted.
    """
    parts = []
    for item in lst:
        if item is None:
            continue
        try:
            # Text types concatenate directly; this also keeps Python 2
            # ``unicode`` values intact instead of forcing them through str().
            parts.append(item + " ")
        except TypeError:
            # Not a string (number, container, ...): fall back to its str().
            parts.append(str(item) + " ")
    return "".join(parts)
| 18 | |
def debug(desc, *args):
    """Print a debug line, but only when the module-level ``debugFlag`` is set.

    The line consists of ``desc``, a space, and the extra arguments joined by
    ``join_strings``.

    Args:
        desc: Leading description; rendered through ``%s`` formatting.
        *args: Extra values handed to ``join_strings``.
    """
    if debugFlag:
        # A single pre-formatted argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("%s %s" % (desc, join_strings(args)))
| 23 | |
def log(desc, arg):
    """Print one log line of the form ``<desc> <arg>`` to standard output.

    Args:
        desc: Description printed first.
        arg: Value printed after a single space, rendered via ``%s``.
    """
    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("%s %s" % (desc, arg))
| 27 | |
def getUid(vertex):
    """Return the identifying value of a vertex, selected by its 'nodeLabel'.

    Lookup rules:
      * no 'nodeLabel' (or a null one) -> None
      * 'user'                         -> vertex['userId']
      * 'tag'                          -> vertex['name']
      * 'lockNode'                     -> vertex.get('uid') (None if absent)
      * any other label                -> vertex['uid']

    Args:
        vertex: Mapping describing one vertex.

    Returns:
        The selected value, or None as described above.

    Raises:
        KeyError: If a key read with ``[]`` above is missing from *vertex*.
    """
    label = vertex.get('nodeLabel')
    debug(label)

    if label is None:
        # Unlabelled vertices carry no uid.
        uid = None
    elif label == 'user':
        uid = vertex['userId']
    elif label == 'tag':
        uid = vertex['name']
    elif label == 'lockNode':
        # Lock nodes are read leniently: a missing 'uid' yields None.
        uid = vertex.get('uid')
    else:
        uid = vertex['uid']

    debug(label, uid)
    return uid
| 45 | |
def _find_duplicated_uids(vertices):
    """Return ``{uid: first_id}`` for every uid carried by more than one vertex."""
    first_id_by_uid = {}
    duplicated = {}
    for vertex in vertices:
        uid = getUid(vertex)
        known_id = first_id_by_uid.get(uid)
        if known_id is None:
            first_id_by_uid[uid] = vertex.get('_id')
        else:
            duplicated[uid] = known_id
    return duplicated


def _split_duplicates(vertices, duplicated_uids):
    """Return ``(ids_by_uid, ids_to_delete)`` for vertices with a duplicated uid."""
    ids_by_uid = {}
    ids_to_delete = []
    for vertex in vertices:
        vertex_id = vertex["_id"]
        uid = getUid(vertex)
        if duplicated_uids.get(uid) is None:
            continue
        debug("uid to id pair", uid if uid is not None else 'None', vertex_id)
        seen_ids = ids_by_uid.get(uid)
        if seen_ids is None:
            # First vertex with this uid (in file order) is the one we keep.
            ids_by_uid[uid] = [vertex_id]
        else:
            seen_ids.append(vertex_id)
            ids_to_delete.append(vertex_id)
    return ids_by_uid, ids_to_delete


def generateFile(inputFile, outputFile):
    """Write a copy of a graph JSON file with duplicate-uid vertices removed.

    The input must be a JSON object with a 'vertices' list and an 'edges'
    list. For every uid (as computed by ``getUid``) that occurs on more than
    one vertex, the first vertex in file order is kept and the later ones are
    dropped. Every edge whose '_outV' or '_inV' refers to a dropped vertex id
    is dropped as well (edges are not re-pointed to the kept vertex).
    Top-level keys are written in reverse alphabetical order.

    NOTE(review): vertices for which ``getUid`` returns None all share the key
    None, so all but the first of them are removed too - confirm this is
    intended.

    Args:
        inputFile: Path of the JSON file to read.
        outputFile: Path of the JSON file to write.

    Raises:
        KeyError: If 'vertices' or 'edges' is missing, a vertex lacks '_id',
            or an edge lacks '_outV' / '_inV'.
    """
    with open(inputFile) as json_file:
        json_data = json.load(json_file)

    vertices = json_data['vertices']

    # The uid table is local to this call, so repeated calls do not see the
    # vertices of previously processed files.
    duplicated_uids = _find_duplicated_uids(vertices)
    log("duplicate ids", duplicated_uids)
    log("number of vertices is", len(vertices))

    ids_by_uid, delete_ids = _split_duplicates(vertices, duplicated_uids)
    log("ids", ids_by_uid)
    log("deleteIndexes", delete_ids)
    log("deleteIndexes size", len(delete_ids))

    # Set membership keeps the two filters below linear in the input size;
    # vertex ids are JSON scalars and therefore hashable.
    doomed = set(delete_ids)

    kept_vertices = [v for v in vertices if v.get('_id') not in doomed]
    json_data['vertices'] = kept_vertices
    log("number of vertexes after filter", len(kept_vertices))

    edges = json_data['edges']
    log("number of edges", len(edges))

    json_data['edges'] = [
        e for e in edges
        if e['_outV'] not in doomed and e['_inV'] not in doomed
    ]
    log("number of edges after filter", len(json_data['edges']))

    # Reverse key order, e.g. 'vertices' is emitted before 'edges'.
    json_data = OrderedDict(
        sorted(json_data.items(), key=lambda t: t[0], reverse=True))

    with open(outputFile, 'w') as outfile:
        json.dump(json_data, outfile)
    log("output file is", outputFile)
| 108 | |
def _print_usage():
    """Print the one-line command-line usage message."""
    print("%s %s" % (sys.argv[0], '-i <inputfile>'))


def main(argv):
    """Parse the command line and de-duplicate the given graph file.

    Recognised options:
      -h                    print usage and exit with status 3
      -i / --ifile <file>   input JSON file (required)
      -o / --ofile <file>   accepted for compatibility; its value is not used

    The result is written next to the input as ``<inputfile>.noduplicates``.

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: Status 2 on an option-parsing error, status 3 when help
            is requested or no input file was given.
    """
    print("Number of arguments: %s arguments." % len(sys.argv))
    inputfile = None
    try:
        # "h" is a plain flag (no trailing colon) so a bare -h is accepted.
        opts, _positional = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        _print_usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            _print_usage()
            sys.exit(3)
        elif opt in ("-i", "--ifile"):
            inputfile = arg

    if inputfile is None:
        _print_usage()
        sys.exit(3)

    print('Input file is " %s' % inputfile)
    generateFile(inputfile, inputfile + '.noduplicates')
| 131 | |
| 132 | |
# Script entry point: hand everything after the program name to main().
if __name__ == "__main__":
    main(sys.argv[1:])
| 135 | |
| 136 | # print x['uid'] |