只要某文件夹下所有 csv 文件的结构相同,在该文件夹路径下运行以下代码即可自动合并,结果输出到 all.csv。结果文件在原有 csv 结构的基础上新增一列 origin_file_name,值为该行数据所在的原 csv 文件名,从而保证合并后不丢失任何信息。
import os

import pandas as pd


def merge_csv_files(folder='.', result_csv='all.csv'):
    """Merge every CSV file in *folder* into one CSV written to the same folder.

    Each input file is read once, gets a leading ``origin_file_name`` column
    holding the name of the file the row came from, and all frames are then
    concatenated in a single ``pd.concat`` call.  (``DataFrame.append`` was
    removed in pandas 2.0 and copied the whole accumulated frame on every
    iteration, so it is not used.)

    Args:
        folder: Directory that contains the CSV files to merge.
        result_csv: File name of the merged output inside *folder*.  A file
            with this name is never treated as an input, so the script can be
            re-run without merging its own previous output.

    Raises:
        Exception: If *folder* contains no CSV file to merge.
    """
    # Sort so that the row order of the result does not depend on the
    # arbitrary order in which os.listdir returns directory entries.
    csv_names = sorted(
        name for name in os.listdir(folder)
        if name.endswith('.csv') and name != result_csv
    )
    if not csv_names:
        raise Exception("当前目录下没有要合并的 csv 文件")

    frames = []
    for name in csv_names:
        df = pd.read_csv(os.path.join(folder, name))
        # Record the source file as the first column of every row.
        df.insert(0, 'origin_file_name', [name] * len(df))
        frames.append(df)

    all_df = pd.concat(frames, ignore_index=True)
    all_df.to_csv(os.path.join(folder, result_csv), index=False, encoding='utf-8')


if __name__ == '__main__':
    merge_csv_files()
2023.10.30 更新:如果上面的代码耗时较长,可尝试使用以下代码:
import os

import pandas as pd


def do_merge(input_folder, output_file='all.csv', append_file_name_col=True, file_name_col='origin_file_name'):
    """Merge all CSV files found directly in *input_folder* into *output_file*.

    Every input file is read once.  The column layout of the first file (in
    sorted name order) defines the columns and their order in the result;
    every other file is re-ordered to that layout before concatenation.
    All rows of all files are kept.

    Args:
        input_folder: Directory that contains the CSV files to merge.
        output_file: Path of the merged CSV, written with a UTF-8 BOM
            (``utf-8-sig``).  A relative path is resolved against the current
            working directory, not against *input_folder*.
        append_file_name_col: When true, insert a leading column holding the
            source file name without its ``.csv`` extension.
        file_name_col: Name of that extra column.

    Raises:
        Exception: If *input_folder* does not exist or holds no CSV file to
            merge.
        KeyError: If a file lacks one of the columns of the first file.
        ValueError: If *file_name_col* already exists in an input file.
    """
    if not os.path.exists(input_folder):
        raise Exception(f"目录 {input_folder} 不存在")

    output_path = os.path.abspath(output_file)

    def is_input(name):
        # Skip anything that is not a CSV, and skip the output itself.  The
        # bare-name comparison is the historical rule; the absolute-path
        # comparison additionally catches an output_file given with a
        # directory part that points into input_folder.
        if not name.endswith('.csv') or name == output_file:
            return False
        return os.path.abspath(os.path.join(input_folder, name)) != output_path

    # Sorted so that the "first file" and the row order are deterministic.
    csv_files = sorted(name for name in os.listdir(input_folder) if is_input(name))
    if not csv_files:
        raise Exception(f"当前目录 {os.path.abspath(input_folder)} 下没有要合并的 csv 文件")

    save_cols = None
    df_list = []
    for index, file in enumerate(csv_files):
        print(f'{index + 1} / {len(csv_files)} {file}')
        df = pd.read_csv(os.path.join(input_folder, file), float_precision='high')
        if append_file_name_col:
            file_name = os.path.splitext(file)[0]
            df.insert(0, file_name_col, [file_name] * len(df))
        if save_cols is None:
            # The first file fixes the column set and order of the result.
            save_cols = df.columns.tolist()
        df_list.append(df[save_cols])

    all_df = pd.concat(df_list, ignore_index=True)
    # No de-duplication here: de-duplicating on the file-name column alone
    # would keep just one row per source file.
    print(all_df.shape[0])
    all_df.to_csv(output_file, index=False, encoding='utf-8-sig')


if __name__ == '__main__':
    do_merge(input_folder='./')