I have a large text file (around 10 GB) which contains a number of stories. Each story starts with the marker $$.
Below is a sample of the file:
$$
AA This is story 1
BB 345
$$
AA This is story 2
BB 456
I want to split this file into pieces of around 250 MB each, but no story should be divided between two different files.
Can anyone help me with the Unix or Perl code for this?
use strict;
use warnings;
use autodie;
# Split the input (files named on the command line, or STDIN) into
# sequentially numbered files "chunk1", "chunk2", ... whose sizes are as close
# as possible to $targetsize, never dividing a story between two files.

# Read one story per record: the "$$" marker line is used as the input record
# separator, so each read returns the text up to and including the next marker.
local $/ = "\$\$\n";

my $targetsize = 250*1024*1024;    # desired size of each output file
my $fileprefix = 'chunk';          # output files are "$fileprefix$outfile"
my $outfile    = 0;                # number of the current output file (0 = none yet)
my $outfh;                         # handle of the current output file
my $outsize    = 0;                # amount written to the current output file

while (my $story = <>) {
    chomp($story);                 # removes the trailing "$$\n" separator

    # The text in front of the very first marker is an empty record. Test the
    # length rather than truthiness so a story consisting of "0" is kept.
    next unless length $story;

    # Put the marker back in front of the story it introduces.
    $story = "$/$story";
    my $story_size = length($story);

    # Start a new file when none is open yet, or when appending this story
    # would leave the current file farther from the target size than it is now.
    if ( !$outfile
        || abs($outsize - $targetsize) < abs($outsize + $story_size - $targetsize) ) {
        # Close explicitly: autodie only reports buffered write errors on an
        # explicit close, not on the implicit one done by re-opening.
        close $outfh if $outfh;
        ++$outfile;
        open $outfh, '>', "$fileprefix$outfile";
        $outsize = 0;
    }

    $outsize += $story_size;
    print {$outfh} $story
        or die "Cannot write to $fileprefix$outfile: $!\n";
}

# Flush and check the last output file as well.
close $outfh if $outfh;
csplit is what you want. It does the same as split
but based on a pattern.
Alternative in C++ (not tested):
#include <boost/shared_ptr.hpp>
#include <sstream>
#include <iostream>
#include <fstream>
#include <string>
// Open the next output file and make `out` refer to it.
//
// Files are named "<prefix>_<n>", where n starts at 0 and grows by one on
// every call (the counter is a function-local static). Resetting `out` drops
// its reference to the previous stream, which destroys (and thereby closes)
// that stream when `out` was its only owner.
//
// out    - shared pointer that receives the newly opened stream.
// prefix - leading part of the output file name.
void new_output_file(boost::shared_ptr<std::ofstream> &out, const char *prefix)
{
    static int i = 0;
    std::ostringstream filename;
    filename << prefix << "_" << i++;
    // std::ofstream cannot be constructed from the ostringstream itself; pass
    // the string it built. c_str() keeps this valid for pre-C++11 compilers.
    out.reset(new std::ofstream(filename.str().c_str()));
}
// Split the file argv[1] into pieces named "<argv[2]>_<n>".
//
// Lines are copied to the current output file; once at least max_size bytes
// have been counted for it, the next line that begins with "$$" starts a new
// output file, so a story is never divided between two files.
//
// Returns 0 on success, 1 when the arguments are missing or the input file
// cannot be opened.
int main(int argc, char **argv)
{
    if(argc < 3)
    {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "split")
                  << " <input-file> <output-prefix>" << std::endl;
        return 1;
    }

    std::ifstream in(argv[1]);
    if(!in)
    {
        std::cerr << "cannot open input file: " << argv[1] << std::endl;
        return 1;
    }

    long size = 0;
    const long max_size = 200 * 1024 * 1024;
    std::string line;
    boost::shared_ptr<std::ofstream> out;
    new_output_file(out, argv[2]);

    // Test the result of getline itself: checking in.good() before the read
    // would emit one spurious empty line after the last real line.
    while(std::getline(in, line))
    {
        size += line.length() + 1 /* line termination char */;
        // Only switch files on a story marker, i.e. a line starting with "$$".
        if(size >= max_size && line.compare(0, 2, "$$") == 0)
        {
            new_output_file(out, argv[2]);
            size = line.length() + 1;
        }
        // Write through the pointer to the stream it owns. '\n' instead of
        // std::endl avoids flushing the file after every single line.
        *out << line << '\n';
    }
    return 0;
}
I have modified ysth's code and found it to be working. Please suggest any changes you think would make it better.
use strict;
use warnings;
# Split the input (files named on the command line, or STDIN) line by line
# into sequentially numbered files "chunk1", "chunk2", ...
#
# A new output file is started at the first "$$" marker line seen after the
# current file has grown past $targetsize, so a story is never divided between
# two files. Every input line is copied to the output, including blank lines
# and lines consisting of "0".

my $targetsize = 50*1024*1024;    # size after which the next story opens a new file
my $fileprefix = 'chunk';         # output files are "$fileprefix$outfile"
my $outfile    = 0;               # number of the current output file
my $outsize    = 0;               # bytes written to the current output file
my $outfh;                        # handle of the current output file

while (my $line = <>) {
    chomp($line);

    # Open the first file unconditionally; afterwards roll over only on a
    # marker line, and only once the current file is already over the target.
    if ( !$outfh || ( $line eq '$$' && $outsize > $targetsize ) ) {
        if ($outfh) {
            # Buffered write errors surface at close, so check it.
            close($outfh)
                or die "Cannot close $fileprefix$outfile: $!\n";
        }
        ++$outfile;
        open $outfh, '>', "$fileprefix$outfile"
            or die "Cannot open $fileprefix$outfile: $!\n";
        $outsize = 0;
    }

    print {$outfh} "$line\n"
        or die "Cannot write to $fileprefix$outfile: $!\n";
    $outsize += length($line) + 1;    # + 1 for the newline just written
}

# Flush and check the last output file as well.
if ($outfh) {
    close($outfh)
        or die "Cannot close $fileprefix$outfile: $!\n";
}
精彩评论