-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
82 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
""" | ||
merge csv files inside a directory into one csv file, optinally, select | ||
column value to be fitting a regular expression, e.g. params={key1:val1, | ||
key2:val2}, let pattern=val2 | ||
""" | ||
import os | ||
import sys | ||
import argparse | ||
import glob | ||
import re | ||
import pandas as pd | ||
|
||
|
||
# Set up argument parsing | ||
parser = argparse.ArgumentParser( | ||
description="merge all CSV files in a specified directory.") | ||
|
||
parser.add_argument('directory', type=str, | ||
help="The directory to search for CSV files") | ||
|
||
parser.add_argument('--pattern', type=str, | ||
help="The pattern to match in the specified column") | ||
|
||
parser.add_argument('--out_file_name', type=str, | ||
default="merged.csv", | ||
help="out csv name") | ||
|
||
parser.add_argument('--column_name', type=str, | ||
default=None, | ||
help="The column name to filter by") | ||
parser.add_argument('--filter_value', type=str, | ||
default=None, | ||
help="The value to match in the specified column") | ||
|
||
# Parse the arguments | ||
args = parser.parse_args() | ||
|
||
# Use the directory passed from the command line | ||
directory = args.directory | ||
column_name = args.column_name | ||
filter_value = args.filter_value | ||
|
||
# List of CSV files to be merged | ||
csv_files = glob.glob(os.path.join(directory, "*.csv")) | ||
|
||
if not column_name: | ||
merged_df = pd.concat([pd.read_csv(file) for file in csv_files]) | ||
|
||
else: | ||
dfs = [] | ||
if args.pattern: | ||
pattern_re = re.compile(args.pattern) | ||
# Loop through each file and filter the rows based on the column value | ||
for file in csv_files: | ||
df = pd.read_csv(file, skipinitialspace=True) | ||
# Check if the column exists in the current file | ||
if column_name in df.columns: | ||
# Filter rows where the column value matches the filter value | ||
if filter_value: | ||
filtered_df = df[df[column_name] == filter_value] | ||
else: | ||
filtered_df = \ | ||
df[df[column_name].astype(str).apply( | ||
lambda x: bool(pattern_re.search(x)))] | ||
# Append the filtered dataframe to the list if it's not empty | ||
if not filtered_df.empty: | ||
dfs.append(filtered_df) | ||
else: | ||
raise RuntimeError(f"column name {column_name} not found in {file}") | ||
|
||
# Combine all files in the list | ||
if dfs: | ||
merged_df = pd.concat(dfs) | ||
else: | ||
print("Merg CSV failed") | ||
sys.exit() | ||
|
||
|
||
# Save the merged file | ||
merged_df.to_csv(args.out_file_name, index=False) | ||
|
||
print(f"Merged CSV saved as {args.out_file_name}") |