开发者

std::merge using 2 mmaped arrays?

开发者 https://www.devze.com 2023-02-07 11:05 出处:网络
I'm mmaping two text files with an integer written on each line. I read them from the drive and I wanted to do a sorted merge on them.

I'm mmaping two text files with an integer written on each line. I read them from the drive and I wanted to do a sorted merge on them. The two input files "1piece0" and "1piece1" each contain a list of sorted integers. The output file has the same size as the two files combined, but not that many integers. Problem: The two input files have 25430000 lines each, so the output file should have 50860000 lines, but it only has 17259463 lines. This is my current code.

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <algorithm>

#define FILESIZE 25430000 * sizeof(int)
#define FILE0 279288034
#define FILE1 279287226
// Creates "file.out" with a size of FILE0 + FILE1 bytes, memory-maps it
// together with the two input files "1piece0" and "1piece1", and runs
// std::merge from the two input mappings into the output mapping.
//
// NOTE(review): all three mappings are accessed through int*, so std::merge
// compares and copies raw sizeof(int)-byte groups of the file contents. If
// the inputs are text (one decimal integer per line) this does not parse
// them; the files would have to hold native binary ints for the merge to be
// meaningful.
int main()
{
    int fd;
    int fd2;
    int fd3;
    int result;
    int *map;
    int *map2;
    int *map3;

    fd3 = open( "file.out", O_RDWR | O_CREAT | O_TRUNC, (mode_t)0755);
    if ( fd3 == -1 ) {
        perror("Error opening file for writing");
        exit(EXIT_FAILURE);
    }

    // Stretch the output file to its final size: seek to the last byte and
    // write a single byte there so the whole range can be mapped.
    if( lseek(fd3, FILE0 + FILE1 - 1, SEEK_SET ) == -1 ) {
        close(fd3);  // fd3 is the only descriptor open at this point
        perror("Error calling lseek\n");
        exit(EXIT_FAILURE);
    }

    result = write(fd3,"",1);
    if( result != 1 ) {
        close(fd3);
        perror("error writing last byte");
        exit(EXIT_FAILURE);
    }

    map3 =(int *) mmap(0, FILE0 + FILE1, PROT_READ | PROT_WRITE, MAP_SHARED, fd3, 0);
    if( map3 == MAP_FAILED ) {  // check the mapping that was just created
        close(fd3);
        perror("Error mmapinG fd3");
        exit(EXIT_FAILURE);
    }


    fd = open( "1piece0", O_RDONLY );
    if( fd == -1 ) {
        perror("Error opening file for reading");
        exit(EXIT_FAILURE);
    }

    map = (int *)mmap(0, FILE0, PROT_READ, MAP_SHARED, fd, 0 );
    if( map == MAP_FAILED ) {
        close(fd);
        perror("error mapping file");
        exit(EXIT_FAILURE);
    }

    fd2 = open( "1piece1", O_RDONLY );
    if( fd2 == -1 ) {
        perror("Error opening file for reading");
        exit(EXIT_FAILURE);
    }

    map2 = (int *)mmap(0, FILE1, PROT_READ, MAP_SHARED, fd2, 0 );
    if( map2 == MAP_FAILED ) {  // check map2, not the earlier mapping
        close(fd2);
        perror("error mapping file");
        exit(EXIT_FAILURE);
    }

    // Merges 25430000 int-sized elements from each input mapping; see the
    // NOTE(review) above about how those elements relate to the file data.
    std::merge( map, map + 25430000, map2, map2 + 25430000, map3 );

    if(munmap(map, FILE0 ) == -1 ) {
        perror("error unmapping map");
    }
    close(fd);

    if(munmap(map3, FILE0 + FILE1 ) == -1 ) {
        perror("error unmapping map3");
    }
    close(fd3);

    if(munmap(map2, FILE1 ) == -1 ) {
        perror("error unmapping map2");
    }
    close(fd2);

    return 0;
}

Can you please tell me what I am doing wrong?

Update: By lines I mean an integer number and then a newline character.


You cannot treat text lines as binary blobs to be manipulated as int pointers.

You can treat text files as text to be extracted and used:

// Merges two streams of whitespace-separated integers, each assumed to be
// in ascending order, into `out`, writing one integer per line.
//
// a_in, b_in: input streams to read ints from.
// out:        receives every successfully extracted int followed by '\n'.
//
// b_in is only read once a first value has been extracted from a_in.
void merge_ints(std::istream &a_in, std::istream &b_in, std::ostream &out) {
  int a, b;

  // Each flag says whether the corresponding variable currently holds a
  // value that has been read but not yet written.
  bool have_a = static_cast<bool>(a_in >> a);
  bool have_b = have_a && static_cast<bool>(b_in >> b);

  // Emit the smaller pending value and refill from the stream it came from.
  while (have_a && have_b) {
    if (a < b) {
      out << a << '\n';
      have_a = static_cast<bool>(a_in >> a);
    } else {
      out << b << '\n';
      have_b = static_cast<bool>(b_in >> b);
    }
  }

  // At most one value is still pending; write it and pick the stream that
  // may still have data. When a_in yielded nothing at all, b_in has not been
  // touched yet and is drained in full.
  std::istream *rest = &b_in;
  if (have_a) {
    out << a << '\n';
    rest = &a_in;
  } else if (have_b) {
    out << b << '\n';
  }

  int tail;
  while (*rest >> tail) {
    out << tail << '\n';
  }
}

Taking advantage of std::merge:

// Merges the ints extracted from `a` and `b` into `out` via std::merge,
// writing "\n" after every value. Both inputs are expected to already be
// sorted in ascending order, as std::merge requires.
void merge_ints(std::istream &a, std::istream &b, std::ostream &out) {
  using IntReader = std::istream_iterator<int>;
  const IntReader end_of_input;  // default-constructed: end-of-stream marker

  IntReader a_first(a);
  IntReader b_first(b);
  std::ostream_iterator<int> sink(out, "\n");

  std::merge(a_first, end_of_input, b_first, end_of_input, sink);
}

// Small demonstration: merges two in-memory streams of sorted integers with
// merge_ints and prints the merged result to standard output.
// Standard-library names are qualified with std:: so the snippet does not
// depend on a using-directive.
int main() {
  std::stringstream a("1\n3\n5\n"), b("2\n4\n6\n7\n"), out;
  merge_ints(a, b, out);
  std::cout << out.str();
}


What do you mean by "lines"?

When you memory map it treats the data like it is memory and here you are reading it like an array of ints. Therefore the inputs must be in native binary format (i.e. with the bytes stored in the same way, same size and same endianness), and 25430000 is the number of ints you are reading in from each collection.

Is that how your inputs are stored?

There are a lot of "magic numbers" here.

0

精彩评论

暂无评论...
验证码 换一张
取 消