Scripts/Fast File Read

From J Wiki
Jump to navigation Jump to search

Fast File Reading

Fast extraction of a relatively small amount of data from a large file is easy with J, using mapped files.

For example, distinct IP addresses can be extracted from a log file as follows:

J version: Extract distinct IP addresses following a label from a large file

findInFile=: 4 : 0
NB. Find the data that follows each occurrence of a label in a file.
NB. x: label preceding the data
NB. y: the file name
NB. The data is taken to end at the first blank; at most 16 characters
NB. after the label are examined.
NB. Result: character matrix of the distinct values, one per row
NB. (rows are padded to a common width by the take).
NB. NOTE(review): map_jmf_ / unmap_jmf_ / JCHAR come from the jmf
NB. (mapped files) addon, which is presumably loaded elsewhere - confirm.
	
	JCHAR map_jmf_ 'file';y         NB. map the file as characters; mapped files really speed things up
	nos=. x I.@:E. file 		NB. start positions of every occurrence of the label in file
	ip=.(nos+/(#x)+i.16){file 	NB. one row per occurrence: the 16 characters following the label
					NB. NOTE(review): a label within 16 characters of the end of the
					NB. file would produce out-of-range indices - confirm inputs
	unmap_jmf_ 'file'
	~.({."0 1~  (i."1 &' '))ip 	NB. from each row keep the characters before the first blank
								NB. and return the unique rows
)

On a small Acer Aspire One, J took about 0.65 seconds to extract 16000 IP addresses, and to reduce them to the 10 distinct IP addresses, from a 38 MB file.

This program could have been written in C, saving perhaps 0.30 secs, but with a bit more effort.

test=: 3 : 0
NB. Write a large test file, run findInFile on it, then erase the file.
NB. y: ignored
NB. NOTE(review): file=: is a global assignment and findInFile maps the
NB. name 'file'; these may refer to the same name - confirm there is no clash.
NB. NOTE(review): the last sentence is ferase, so the verb's result is the
NB. result of ferase rather than the findInFile result - confirm intent.
NB. NOTE(review): fwrite / ferase are presumably the standard file
NB. utilities, loaded elsewhere - confirm.
	file=:'testfile.2'
	NB. 20000 records, ravelled into one string: the label ' rhost=', a
	NB. dotted-number field built from 40 random values in 1..255, and
	NB. 2000 blanks of padding.
	out=.,(20000 2000$' '),.~' rhost=',"1(' '-.~"1(}:"1 (20000 16$,'.',"1~":>:?40 1$255)))
	out fwrite file
	label =.' rhost='
	label findInFile file
	ferase file
)

C version: Extract distinct IP addresses following a label from a large file

//////////////////////////////////////////////////////
//                                                  //
//  Small C mmap() sample.                          //
//  Written by Martin Cyr.                          //
//  Feel free to change and distribute, but credit  //
//  is always nice. If you use, I'd be pleased to   //
//  hear from you at Spooles at GMail dot com.      //
//                                                  //
//////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

/* Initial capacity (in entries) of the host list created by processFile. */
#define INITIAL_CAPACITY 25
/* Size in bytes of the buffers holding one extracted host string,
 * terminating NUL included; also used as a multiplier when sizing the
 * host list in processFile. */
#define IP_CHAR_LEN 16
/* NOTE(review): not referenced anywhere in the visible code. */
#define LINE_CHAR_LEN 1024
/* Label searched for in the file; the host token is read from the bytes
 * that immediately follow it. */
#define PATTERN " rhost="

/*
 * Forward declarations.
 *
 * Every function is declared with a full prototype (showUsage previously
 * had an empty parameter list, which disables argument checking and means
 * "no parameters" in C23), and parameter names are kept so the
 * declarations document themselves.
 */
void showUsage(char* filename);
int processFile(char* filename, char*** hosts);
int addNextHost(char*** hosts, int hostNum, int* hostMax, char* map, int offset);
int countMatches(char* match, char** array, int count);
void printDestroyArray(char** array, int count);

/*
 * Program entry point.
 *
 * Expects exactly one command-line argument: the name of the file to scan.
 * With any other argument count the usage text is printed and the program
 * exits with EXIT_FAILURE, so callers and scripts can detect the misuse.
 * Otherwise the distinct hosts found by processFile are printed, all
 * memory is released, and EXIT_SUCCESS is returned.
 */
int main(int argc, char** argv)
{
	char** hosts;
	int hostCount;

	if (argc != 2)
	{
		showUsage(argv[0]);
		return EXIT_FAILURE;
	}

	hostCount = processFile(argv[1], &hosts);

	/* printDestroyArray frees each string; the array itself is freed here. */
	printDestroyArray(hosts, hostCount);
	free(hosts);

	return EXIT_SUCCESS;
}

/*
 * Write the usage text to stdout.
 *
 * filename: the name shown as the program name in the first line.
 */
void showUsage(char* filename)
{
	printf("Usage: %s <filename>\n", filename);
	fputs("\tParses the <filename> for occurences of rhost= \n"
	      "\tand sends everything to stdout\n", stdout);
}

/*
 * Count how many of the first `count` strings in `array` are equal to
 * `match` (compared with strcmp).
 *
 * Returns the number of equal entries; 0 when count is not positive.
 */
int countMatches(char* match, char** array, int count)
{
	int hits = 0;
	int idx = 0;

	while (idx < count)
	{
		hits += (strcmp(match, array[idx]) == 0) ? 1 : 0;
		idx++;
	}

	return hits;
}

/*
 * Print each of the first `count` strings in `array` on its own line and
 * free it. The array itself is not freed; that is left to the caller.
 */
void printDestroyArray(char** array, int count)
{
	int idx = 0;

	while (idx < count)
	{
		char* entry = array[idx];

		printf("%s\n", entry);
		free(entry);
		idx++;
	}
}

/*
 * Scan a file for every occurrence of PATTERN and collect the tokens that
 * follow it.
 *
 * filename: path of the file to scan; it is opened read-only and memory
 *           mapped.
 * hosts:    out parameter; receives a newly allocated array of strings.
 *           The caller owns the array and every string in it.
 *
 * Returns the number of entries stored in *hosts (addNextHost only stores
 * a token that is not already present). An empty file yields 0.
 *
 * Any failure to allocate, open, stat or map is reported with perror and
 * terminates the program with EXIT_FAILURE.
 */
int processFile(char* filename, char*** hosts)
{
	int hostCount = 0, hostMax = INITIAL_CAPACITY;
	size_t patternLen = strlen(PATTERN);
	size_t size;
	size_t i;
	int fd;
	char* map;
	struct stat results;

	/*
	 * NOTE(review): the size is expressed in IP_CHAR_LEN-byte units rather
	 * than sizeof(char *). That over-allocates (assuming sizeof(char *) <=
	 * IP_CHAR_LEN) and is kept unchanged so the array stays compatible with
	 * the growth logic in addNextHost.
	 */
	(*hosts) = malloc(hostMax * IP_CHAR_LEN * sizeof(char));
	if (*hosts == NULL)
	{
		perror("Error allocating host list");
		exit(EXIT_FAILURE);
	}

	fd = open(filename, O_RDONLY);
	if (fd == -1)
	{
		perror("Error opening file");
		exit(EXIT_FAILURE);
	}

	/* Query the descriptor, not the name, so the size belongs to the file
	 * that was actually opened. */
	if (fstat(fd, &results) != 0)
	{
		perror("Unable to get file stats");
		exit(EXIT_FAILURE);
	}

	/* mmap rejects a zero length; an empty file simply has no hosts. */
	if (results.st_size <= 0)
	{
		close(fd);
		return 0;
	}
	size = (size_t)results.st_size;

	map = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);

	if (map == MAP_FAILED)
	{
		perror("Error mapping the file");
		exit(EXIT_FAILURE);
	}

	/*
	 * Test for the pattern at every position. A running match counter that
	 * resets on mismatch would skip the byte that caused the mismatch and
	 * miss a pattern preceded by its own first character
	 * (e.g. two blanks before "rhost=").
	 */
	i = 0;
	while (i + patternLen <= size)
	{
		char token[IP_CHAR_LEN];
		size_t avail;

		if (map[i] != PATTERN[0] || memcmp(map + i, PATTERN, patternLen) != 0)
		{
			i++;
			continue;
		}

		/* i now indexes the first byte after the pattern (i <= size). */
		i += patternLen;

		/*
		 * Hand the token over in a small blank-terminated buffer: the scan
		 * for the end of the token then can neither run past the end of
		 * the mapping nor exceed IP_CHAR_LEN - 1 characters, and no file
		 * offset has to fit in an int.
		 */
		avail = size - i;
		if (avail > IP_CHAR_LEN - 1)
			avail = IP_CHAR_LEN - 1;
		memcpy(token, map + i, avail);
		token[avail] = ' ';

		hostCount = addNextHost(hosts, hostCount, &hostMax, token, 0);
	}

	if (munmap(map, size) == -1)
	{
		perror("Error unmapping the file");
	}

	close(fd);
	return hostCount;
}

/*
 * Extract the token that starts at map[offset] and append it to the host
 * list if it is not already there.
 *
 * hosts:   pointer to the host array; the array may be reallocated.
 * hostNum: number of entries currently stored in *hosts.
 * hostMax: in/out capacity of *hosts, in entries; doubled when full.
 * map:     buffer to read the token from.
 * offset:  index in map of the first character of the token.
 *
 * The token ends at the first blank, tab, CR, LF or NUL byte, and is
 * truncated to IP_CHAR_LEN - 1 characters so it always fits its buffer.
 * No length is passed for map, so the caller must guarantee that one of
 * those terminators, or IP_CHAR_LEN - 1 readable bytes, follows offset.
 *
 * Returns the new number of entries: hostNum + 1 when the token was
 * added, hostNum when it was empty or already present. Allocation failure
 * is reported with perror and terminates the program with EXIT_FAILURE.
 */
int addNextHost(char*** hosts, int hostNum, int* hostMax, char* map, int offset)
{
	char host[IP_CHAR_LEN];
	const char* src = map + offset;
	int pos = 0;

	/* Bounded copy: one byte of host is reserved for the terminating NUL. */
	while (pos < IP_CHAR_LEN - 1)
	{
		char ch = src[pos];

		if (ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' || ch == '\0')
			break;

		host[pos] = ch;
		pos++;
	}

	host[pos] = '\0';

	/* Nothing to store for an empty token or a host already in the list. */
	if ((pos == 0) || (countMatches(host, *hosts, hostNum) != 0))
		return hostNum;

	/* Grow before writing slot hostNum: the valid slots are 0 .. *hostMax - 1. */
	if (hostNum >= *hostMax)
	{
		int newMax = (*hostMax > 0) ? (*hostMax * 2) : 1;
		char** grown = realloc(*hosts, (size_t)newMax * sizeof **hosts);

		if (grown == NULL)
		{
			perror("Error growing host list");
			exit(EXIT_FAILURE);
		}

		*hosts = grown;
		*hostMax = newMax;
	}

	/* calloc zero-fills, so the stored string is NUL-terminated after the copy. */
	(*hosts)[hostNum] = calloc(IP_CHAR_LEN, sizeof(char));
	if ((*hosts)[hostNum] == NULL)
	{
		perror("Error allocating host entry");
		exit(EXIT_FAILURE);
	}
	memcpy((*hosts)[hostNum], host, (size_t)pos);

	return hostNum + 1;
}