diff --git a/scripts/create_db.py b/scripts/create_db.py index e4826ae..ae02d7e 100644 --- a/scripts/create_db.py +++ b/scripts/create_db.py @@ -25,7 +25,7 @@ def get_sheng_yun(pinyin): return None, None if pinyin == "ng": return "", "en" - for i in xrange(2, 0, -1): + for i in range(2, 0, -1): t = pinyin[:i] if t in SHENGMU_DICT: return t, pinyin[len(t):] @@ -34,7 +34,7 @@ def get_sheng_yun(pinyin): def encode_pinyin(pinyin): if pinyin == None or pinyin == "": return 0 - return pinyin_id[pinyin] + return pinyin_id[pinyin] e = 0 for c in pinyin: e = (e << 5) + (ord(c) - ord('a') + 1) @@ -45,7 +45,7 @@ con2.commit() new_freq = 0 freq = 0 -print "INSERTING" +print("INSERTING") for r in con1.execute("SELECT * FROM py_phrase ORDER BY freq"): ylen = r[0] phrase = r[10] @@ -53,11 +53,13 @@ for r in con1.execute("SELECT * FROM py_phrase ORDER BY freq"): freq = r[11] new_freq += 1 - if ylen <= 4: - pys = map(lambda id: ID_PINYIN_DICT[id], r[1: 1 + ylen]) - else: - pys = map(lambda id: ID_PINYIN_DICT[id], r[1: 5]) + r[5].encode("utf8").split("'") - + try: + if ylen <= 4: + pys = [ID_PINYIN_DICT[id] for id in r[1: 1 + ylen]] + else: + pys = [ID_PINYIN_DICT[id] for id in r[1: 5]] + r[5].split("'") + except KeyError: + continue i = ylen - 1 if i >= 15: i = 15 @@ -70,17 +72,17 @@ for r in con1.execute("SELECT * FROM py_phrase ORDER BY freq"): sheng_yun.append(y) - column = [phrase, new_freq] + map(encode_pinyin, sheng_yun) + column = [phrase, new_freq] + list(map(encode_pinyin, sheng_yun)) sql = insert_sql % (i, ",".join(["?"] * len(column))) con2.execute (sql, column) -print "Remove duplicate" -for i in xrange(0, 16): - sql = "DELETE FROM py_phrase_%d WHERE rowid IN (SELECT rowid FROM (SELECT count() as count, rowid FROM py_phrase_%d GROUP by %s,phrase) WHERE count > 1)" % (i, i, ",".join(map(lambda i: "s%d,y%d"%(i,i), range(0, i + 1)))) +print("Remove duplicate") +for i in range(0, 16): + sql = "DELETE FROM py_phrase_%d WHERE rowid IN (SELECT rowid FROM (SELECT count() as count, rowid FROM py_phrase_%d GROUP by %s,phrase) WHERE count > 1)" % (i, i, ",".join(["s%d,y%d"%(i,i) for i in range(0, i + 1)])) con2.execute(sql) con2.commit() -print "CACUUM" +print("CACUUM") con2.execute("VACUUM;") con2.commit()