1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
| import chardet
import pandas as pd
def get_file_encoding(file_path):
    """Detect the text encoding of a file with chardet.

    Args:
        file_path: Path of the file to inspect. It is opened in binary mode
            and read in full.

    Returns:
        The value stored under the ``'encoding'`` key of the
        ``chardet.detect`` result.
    """
    # The context manager closes the handle even if reading raises; the
    # previous open()/close() pair leaked it on any exception in between.
    with open(file_path, 'rb') as f:
        raw_bytes = f.read()
    return chardet.detect(raw_bytes)['encoding']
def get_df(file_path, id_name, field_list):
    """Load a CSV file as a DataFrame indexed by ``id_name``.

    The encoding passed to ``pd.read_csv`` comes from ``get_file_encoding``.
    Once the ``id_name`` column has become the index, only the columns in
    ``field_list`` are kept, missing values are replaced with empty strings
    and the rows are sorted by index.

    Args:
        file_path: Path of the CSV file to read.
        id_name: Column to use as the index.
        field_list: Columns to keep after indexing.

    Returns:
        The selected, filled and index-sorted DataFrame.
    """
    detected_encoding = get_file_encoding(file_path)
    raw_frame = pd.read_csv(file_path, encoding=detected_encoding)
    indexed_frame = raw_frame.set_index([id_name])
    selected = indexed_frame[field_list]
    return selected.fillna('').sort_index()
def contrast_df(new_path, old_path, id_name, field_list, save_file_path='./auto_create/'):
    """Compare a new CSV with an old one and report added and changed rows.

    Both files are loaded through ``get_df`` and indexed by ``id_name``.
    Rows whose ID exists only in the new file are "added". Rows whose ID
    exists in both files but whose compared columns differ are "updated".
    Each non-empty result is written to
    ``save_file_path + 'add_' + <file name>`` or
    ``save_file_path + 'update_' + <file name>``, and a summary line is
    printed for each category.

    Args:
        new_path: Path of the new CSV. Its last ``/``-separated component is
            used to name the output files.
        old_path: Path of the old CSV to compare against.
        id_name: Column used as the row identifier.
        field_list: Columns to compare. ``id_name`` may be included and is
            ignored. The list itself is not modified.
        save_file_path: Prefix prepended to the output file names.

    Returns:
        A tuple ``(add_df, update_df)`` of DataFrames taken from the new file.
    """
    # Build a filtered copy instead of popping from the caller's list:
    # mutating the argument made a second call with the same list raise
    # ValueError, as did any call where id_name was not listed.
    compare_fields = [field for field in field_list if field != id_name]
    file_name = new_path.split('/')[-1]

    new_df = get_df(new_path, id_name, compare_fields)
    old_df = get_df(old_path, id_name, compare_fields)

    # Rows of the new file whose ID is already known to the old file.
    in_old = new_df.index.isin(old_df.index)

    add_df = new_df[~in_old]
    if not add_df.empty:
        print('有{}条数据新插入了'.format(len(add_df)))
        add_df.to_csv(save_file_path + 'add_' + file_name)
    else:
        print('没有数据新插入')

    # Compare only IDs present in both files. `!=` between DataFrames needs
    # identical labels, so IDs missing from the new file must be dropped
    # from the old side too; otherwise the comparison raises ValueError.
    new_df = new_df[in_old]
    old_common = old_df[old_df.index.isin(new_df.index)]

    # A row counts as changed when at least one compared column differs.
    row_differs = (new_df != old_common).any(axis=1)
    changed_index = row_differs.index[row_differs.to_numpy()]
    update_df = new_df[new_df.index.isin(changed_index)]
    if not update_df.empty:
        print('有{}条数据更新'.format(len(update_df)))
        update_df.to_csv(save_file_path + 'update_' + file_name)
    else:
        print('没有数据更新')

    return add_df, update_df
|