只要某个文件夹下所有 csv 文件的结构相同,在该文件夹下运行以下代码即可自动合并。合并结果输出到 all.csv:它在原有 csv 结构的基础上新增一列 origin_file_name,值为该行数据来源的 csv 文件名,从而保证合并后不丢失任何信息。

# -*- coding: utf-8 -*-
# author: inspurer(月小水长)
# create_time: 2022/4/13 10:33
# 运行环境 Python3.6+
# github https://github.com/inspurer
# website https://buyixiao.github.io/
# 微信公众号 月小水长

import os
import pandas as pd

# Merge every CSV in the current working directory into a single ``all.csv``.
# All inputs are expected to share the same column layout; the merged file
# gets one extra leading column, ``origin_file_name``, holding the name of the
# file each row came from.
result_csv = 'all.csv'

# Skip the output of a previous run so the script never merges all.csv into
# itself when it is executed again in the same directory.
csv_files = [
    name for name in os.listdir('.')
    if name.endswith('.csv') and name != result_csv
]
if not csv_files:
    raise Exception("当前目录下没有要合并的 csv 文件")

frames = []
for file in csv_files:
    df = pd.read_csv(file)
    # A scalar is broadcast to every row, tagging each row with its source.
    df.insert(0, 'origin_file_name', file)
    frames.append(df)

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame
# on every call; a single concat works on current pandas and is linear.
all_df = pd.concat(frames, ignore_index=True)
all_cols = all_df.columns.values.tolist()

all_df.to_csv(result_csv, index=False, encoding='utf-8')

2023.10.30 更新:如果上面的代码耗时较多,可尝试使用以下代码。注意:这一版本写入 origin_file_name 列的文件名不含 .csv 后缀。

# -*- coding: utf-8 -*-
# author: inspurer(月小水长)
# create_time: 2023/10/30 15:23
# 运行环境 Python3.6+
# github https://github.com/inspurer
# website https://buyixiao.github.io/
# 微信公众号 月小水长

import pandas as pd

import os


def do_merge(input_folder, output_file='all.csv', append_file_name_col=True, file_name_col='origin_file_name',
             unique_cols=None):
    """Merge every CSV file found directly in ``input_folder`` into one CSV.

    The column layout is taken from the first file (in sorted name order);
    every other file is projected onto those columns. The merged frame is
    de-duplicated and written to ``output_file`` encoded as ``utf-8-sig``.
    Progress and the row counts before/after de-duplication are printed.

    Args:
        input_folder: Directory that is scanned (non-recursively) for
            ``*.csv`` files.
        output_file: Path of the merged CSV. If it lies inside
            ``input_folder`` it is excluded from the inputs.
        append_file_name_col: When true, insert a leading column holding the
            source file name without its extension.
        file_name_col: Name of that leading column.
        unique_cols: Optional column name, or list of column names, that
            identifies a record. When given, rows are de-duplicated on these
            columns (plus ``file_name_col`` if it is appended). When omitted,
            only rows that are identical in every column are dropped.

    Raises:
        Exception: If ``input_folder`` does not exist or holds no CSV file
            to merge.
        KeyError: If a file lacks one of the columns of the first file.
    """
    if not os.path.exists(input_folder):
        raise Exception(f"目录 {input_folder} 不存在")

    result_csv = output_file
    # Compare full paths, not bare names: output_file may carry a directory
    # part, and a same-named file in another directory is a legitimate input.
    result_path = os.path.abspath(result_csv)

    # List the directory once; sorting makes the row order (and therefore
    # which duplicate keep='first' retains) independent of the file system.
    csv_files = [
        name for name in sorted(os.listdir(input_folder))
        if name.endswith('.csv')
        and os.path.abspath(os.path.join(input_folder, name)) != result_path
    ]
    if not csv_files:
        raise Exception(f"当前目录 {os.path.abspath(input_folder)}下没有要合并的 csv 文件")

    file_cnt = len(csv_files)
    save_cols = None
    df_list = []
    for index, file in enumerate(csv_files):
        print(f'{index + 1}/ {file_cnt} {file}')
        df = pd.read_csv(os.path.join(input_folder, file), float_precision='high')

        if append_file_name_col:
            file_name = file[:file.rindex('.')]
            # A scalar is broadcast to every row of the frame.
            df.insert(0, file_name_col, file_name)

        if save_cols is None:
            # The first file defines the column layout of the result.
            save_cols = df.columns.values.tolist()

        df_list.append(df[save_cols])

    all_df = pd.concat(df_list, ignore_index=True)

    print(all_df.shape[0])
    if unique_cols:
        subset_ = [unique_cols] if isinstance(unique_cols, str) else list(unique_cols)
        if append_file_name_col and file_name_col not in subset_:
            subset_.append(file_name_col)
    else:
        # De-duplicating on the file-name column alone would keep a single
        # row per source file, so fall back to comparing whole rows.
        subset_ = None
    all_df.drop_duplicates(subset=subset_, inplace=True, keep='first')
    print(all_df.shape[0])

    all_df.to_csv(result_csv, index=False, encoding='utf-8-sig')


# Script entry point: merge the CSV files of the current working directory
# using the default output name and file-name column.
if __name__ == "__main__":
    do_merge(
        input_folder='./',
    )