python – Splitting one csv into multiple files

python – Splitting one csv into multiple files

In Python

Use readlines() and writelines() to do that. Here is an example:

>>> csvfile = open(import_1458922827.csv, r).readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
...     if i % 1000 == 0:
...         open(str(filename) + .csv, w+).writelines(csvfile[i:i+1000])
...         filename += 1

The output files will be named 1.csv, 2.csv, and so on.

From terminal

FYI, you can do this from the command line using split as follows:

$ split -l 1000 import_1458922827.csv

I suggest you not reinvent the wheel. There is an existing solution. Source here

import os


def split(filehandler, delimiter=",", row_limit=1000,
          output_name_template="output_%s.csv", output_path=".", keep_headers=True):
    """Split the CSV read from *filehandler* into numbered pieces.

    Args:
        filehandler: Open text-mode file (or any iterable of lines) to read.
        delimiter: Field delimiter used for both reading and writing.
        row_limit: Maximum number of data rows written to each piece.
        output_name_template: ``%``-style template that receives the 1-based
            piece number and yields the output file name.
        output_path: Directory the pieces are written into; it must exist.
        keep_headers: When true, the first input row is treated as a header
            and repeated at the top of every piece.

    The first piece is always created, even when the input has no data rows.
    If the input is completely empty, no header is written.
    """
    import csv

    reader = csv.reader(filehandler, delimiter=delimiter)
    # next(reader, None) works on Python 3 (reader.next() does not) and
    # tolerates an empty input instead of leaking StopIteration.
    headers = next(reader, None) if keep_headers else None

    def open_piece(piece):
        # Open output file number `piece` and write the header row, if any.
        path = os.path.join(output_path, output_name_template % piece)
        # newline="" lets the csv module control line endings itself.
        out_file = open(path, "w", newline="")
        writer = csv.writer(out_file, delimiter=delimiter)
        if headers is not None:
            writer.writerow(headers)
        return out_file, writer

    current_piece = 1
    current_out_file, current_out_writer = open_piece(current_piece)
    try:
        for i, row in enumerate(reader):
            # Row i (0-based) belongs to the next piece once the running
            # total exceeds what the pieces opened so far can hold.
            if i + 1 > row_limit * current_piece:
                current_out_file.close()
                current_piece += 1
                current_out_file, current_out_writer = open_piece(current_piece)
            current_out_writer.writerow(row)
    finally:
        # Always flush and close the last piece, even if writing failed.
        current_out_file.close()

Use it like:

# Open the input with newline="" (as the csv module recommends) and let the
# with-block close it once the split is finished.
with open("/your/pat/input.csv", "r", newline="") as source:
    split(source)

python – Splitting one csv into multiple files

A python3-friendly solution:

def split_csv(source_filepath, dest_folder, split_file_prefix,
              records_per_file):
    """
    Split a source csv into multiple csvs of equal numbers of records,
    except the last file.

    Includes the initial header row in each split file.

    Split files follow a zero-index sequential naming convention like so:

        `{split_file_prefix}_0.csv`

    Args:
        source_filepath: Path of the csv file to split.
        dest_folder: Existing directory that receives the split files.
        split_file_prefix: File-name prefix for every split file.
        records_per_file: Maximum number of data records (header excluded)
            per split file; must be a positive integer.

    Raises:
        ValueError: If ``records_per_file`` is not greater than zero.

    A source with no data records (empty, or header only) produces no files.
    """
    # Local imports keep the function usable when pasted on its own.
    import csv
    import os
    from itertools import islice

    if records_per_file <= 0:
        raise ValueError("records_per_file must be > 0")

    # newline="" lets the csv module handle line endings and embedded newlines.
    with open(source_filepath, "r", newline="") as source:
        reader = csv.reader(source)
        headers = next(reader, None)
        if headers is None:
            # Completely empty source: nothing to split.
            return

        file_idx = 0
        # Peek at the next record before opening a target, so a header-only
        # file is never created (and never has to be deleted afterwards).
        first_record = next(reader, None)
        while first_record is not None:
            target_filename = f"{split_file_prefix}_{file_idx}.csv"
            target_filepath = os.path.join(dest_folder, target_filename)

            with open(target_filepath, "w", newline="") as target:
                writer = csv.writer(target)
                writer.writerow(headers)
                writer.writerow(first_record)
                # Stream the rest of this file's quota straight from the reader.
                writer.writerows(islice(reader, records_per_file - 1))

            file_idx += 1
            first_record = next(reader, None)

Leave a Reply

Your email address will not be published. Required fields are marked *