/* select - extract fields from simple text databases
**
** Copyright  2000 by Jef Poskanzer <jef@mail.acme.com>.
** All rights reserved.
**
** Redistribution and use in source and binary forms, with or without
** modification, are permitted provided that the following conditions
** are met:
** 1. Redistributions of source code must retain the above copyright
**    notice, this list of conditions and the following disclaimer.
** 2. Redistributions in binary form must reproduce the above copyright
**    notice, this list of conditions and the following disclaimer in the
**    documentation and/or other materials provided with the distribution.
** 
** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
** ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
** SUCH DAMAGE.
*/


#include <sys/types.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <regex.h>

/* Not every <regex.h> defines REG_BASIC; where it is missing, use 0 so
** that OR-ing it into the regcomp() flags in main() has no effect.
*/
#ifndef REG_BASIC
#define REG_BASIC 0
#endif


#include "strlist.h"
#include "strtab.h"


/* Input modes - how field names get associated with field values. */
#define IM_UNSPEC 0	/* initial value, before any format flag is seen */
#define IM_MERGE 1	/* -M: the first line read from each data file is its schema */
#define IM_SCHEMA 2	/* -S file: the schema is the first line read from a separate file */
#define IM_NAMEVALUE 3	/* -N: every field of a record is itself "name=value" */

/* Conditionals.
**
** One parsed -where expression of the form  name OP value.  main() fills
** these in while parsing the command line; check_cond() evaluates one
** against a record.
*/
struct cond {
    const char* field_name;	/* left-hand side: name of the field to test */
    int op;	/* one of the OP_* codes */
    const char* str;	/* right-hand side as text (a strdup'd copy) */
    regex_t re;	/* if the str contains wildcards: str compiled by regcomp() */
    int val;	/* if the op is numeric: atoi() of str */
    };

/* Operators for conditionals (the values stored in struct cond.op).
**
** OP_WILDCARD and OP_NOT_WILDCARD have no spelling of their own on the
** command line: main() substitutes them for OP_EQUALS / OP_NOT_EQUALS
** when the right-hand side contains one of its wildcard characters.
** The four ordering operators compare as integers.
*/
#define OP_EQUALS 1		/* = */
#define OP_NOT_EQUALS 2		/* != */
#define OP_WILDCARD 3		/* = if the str contains wildcards */
#define OP_NOT_WILDCARD 4	/* != if the str contains wildcards */
#define OP_LESS 5		/* < */
#define OP_LESS_EQUALS 6	/* <= */
#define OP_GREATER 7		/* > */
#define OP_GREATER_EQUALS 8	/* >= */


/* Globals. */

static char* argv0;
static char* field_seps;
static int multiple_seps;
static int or_mode;
static int ignore_case;
static int all_fields;


/* Forwards. */
static void usage( void );
/* Line input and field splitting. */
static void init_get_line( FILE* fp );
static char* get_line( FILE* fp );
static void parse_fields( char* line, strlist list );
/* Evaluation of the -where conditions against one record. */
static int check_conds( strtab record_tab, struct cond* conds, int num_conds );
static int check_cond( strtab record_tab, struct cond* c );
/* Checked wrappers: each prints "out of memory" and exits on failure. */
static void* malloc_check( size_t size );
static void* realloc_check( void* ptr, size_t size );
static const char* strdup_check( const char* str );
/* NOTE(review): declared as returning strtab, yet the definition returns
** the result of strlist_new() and every caller stores it in a strlist -
** presumably the two typedefs are interchangeable; confirm against
** strlist.h / strtab.h.  Prototype and definition must change together.
*/
static strtab strlist_new_check( void );
static void strlist_add_check( strlist s, const char* str );
static strtab strtab_new_check( void );
static void strtab_add_check( strtab s, const char* name, const char* value );
static void check( void* ptr );


/* Entry point: parse the command line, then read every named file and
** print the requested fields of each record that passes the conditions.
**
** Command line, in this order:
**   flags         -M | -S schemafile | -N   input format, one is required
**                 -f seps   field separator characters (default: tab)
**                 -m        skip extra separators after a separator
**                 -o        OR the conditions together instead of AND
**                 -i        ignore case in string/wildcard comparisons
**   then either   fieldname ... -from    or    -all -from
**   then          filename ...           ("-" means standard input)
**   optionally    -where conditional ...
**
** Each conditional is  name OP value  with OP one of = != < <= > >=.
** For = and !=, a value containing any of the characters . * [ ^ $ is
** compiled as a regular expression; for the ordering operators the
** value must consist of digits and '-' and is compared as an integer.
**
** Output is one line per selected record, the requested fields joined
** by the first character of the separator set; a field the record does
** not have is printed as empty.
**
** Exits with status 0 when all files have been processed, and with
** status 1 on a usage error, an unreadable file, a missing schema line,
** a bad conditional, or memory exhaustion.
*/
int
main( int argc, char** argv )
    {
    int argn;
    int input_mode;
    char* schema_file_name;
    strlist field_names = (strlist) 0;
    const char* field_name;
    const char* field_value;
    strlist file_names;
    const char* file_name;
    int num_conds, max_conds;
    struct cond* conds;
    strlist schema;
    char* cp;
    char* name_end;
    char* wildcards = ".*[^$";  /* only a few of the valid r.e. wildcards */
    int file_num, field_num;
    FILE* fp;
    char* line;
    strlist record_list;
    strtab record_tab;

    argv0 = argv[0];

    /* Parse args. */
    argn = 1;
    input_mode = IM_UNSPEC;
    schema_file_name = (char*) 0;
    field_seps = "\t";
    multiple_seps = 0;
    or_mode = 0;
    ignore_case = 0;
    all_fields = 0;

    /* First, the initial flags.  A flag that takes an operand (-S, -f)
    ** but is the last argument leaves argn past argc, so the
    ** "argn >= argc" test after the loop turns that into a usage error.
    */
    while ( argn < argc && argv[argn][0] == '-' )
	{
	if ( strcmp( argv[argn], "-M" ) == 0 )
	    input_mode = IM_MERGE;
	else if ( strcmp( argv[argn], "-S" ) == 0 )
	    {
	    input_mode = IM_SCHEMA;
	    ++argn;
	    schema_file_name = argv[argn];
	    }
	else if ( strcmp( argv[argn], "-N" ) == 0 )
	    input_mode = IM_NAMEVALUE;
	else if ( strcmp( argv[argn], "-f" ) == 0 )
	    {
	    ++argn;
	    field_seps = argv[argn];
	    }
	else if ( strcmp( argv[argn], "-m" ) == 0 )
	    multiple_seps = 1;
	else if ( strcmp( argv[argn], "-o" ) == 0 )
	    or_mode = 1;
	else if ( strcmp( argv[argn], "-i" ) == 0 )
	    ignore_case = 1;
	else if ( strcmp( argv[argn], "-all" ) == 0 )
	    {
	    /* -all stands in for the field name list, so it must be
	    ** followed directly by -from.
	    */
	    all_fields = 1;
	    ++argn;
	    if ( argn >= argc || strcmp( argv[argn], "-from" ) != 0 )
		usage();
	    /* The jump below bypasses the input-format check that follows
	    ** this loop, so make the same check here.  Without it, -all
	    ** with no format flag would continue with no schema and a
	    ** null field_names list.
	    */
	    if ( input_mode == IM_UNSPEC )
		{
		(void) fprintf(
		    stderr, "%s: you must specify an input format\n", argv0 );
		exit( 1 );
		}
	    goto got_all_fields;
	    }
	else
	    usage();
	++argn;
	}
    if ( argn >= argc )
	usage();
    if ( input_mode == IM_UNSPEC )
	{
	(void) fprintf(
	    stderr, "%s: you must specify an input format\n", argv0 );
	exit( 1 );
	}

    /* The field names. */
    field_names = strlist_new_check();
    while ( argn < argc && strcmp( argv[argn], "-from" ) != 0 )
	{
	strlist_add_check( field_names, argv[argn] );
	++argn;
	}
    if ( strlist_size( field_names ) == 0 )
	{
	(void) fprintf(
	    stderr, "%s: must specify at least one fieldname\n", argv0 );
	exit( 1 );
	}
    if ( argn >= argc )
	usage();
    got_all_fields:
    /* argv[argn] is "-from" on both paths; step over it. */
    ++argn;

    /* The file names. */
    file_names = strlist_new_check();
    while ( argn < argc && strcmp( argv[argn], "-where" ) != 0 )
	{
	strlist_add_check( file_names, argv[argn] );
	++argn;
	}
    if ( strlist_size( file_names ) == 0 )
	{
	(void) fprintf(
	    stderr, "%s: must specify at least one filename\n", argv0 );
	exit( 1 );
	}

    num_conds = 0;
    max_conds = 0;
    conds = (struct cond*) 0;
    if ( argn < argc )
	{
	/* And if they're here, the conditionals.  argv[argn] is "-where". */
	max_conds = 20;
	conds = (struct cond*) malloc_check( max_conds * sizeof(struct cond) );
	++argn;
	while ( argn < argc )
	    {
	    /* Grow the array by doubling when it is full. */
	    if ( num_conds >= max_conds )
		{
		max_conds *= 2;
		conds = (struct cond*) realloc_check(
		    conds, max_conds * sizeof(struct cond) );
		}
	    /* Find the operator; name_end marks where the field name stops
	    ** and cp ends up on the operator's last character.
	    */
	    cp = argv[argn];
	    cp += strcspn( cp, "=!<>" );
	    name_end = cp;
	    switch ( *cp )
		{
		case '=':
		conds[num_conds].op = OP_EQUALS;
		break;
		case '!':
		++cp;
		if ( *cp != '=' )
		    {
		    (void) fprintf( stderr, "%s: bare ! operator?\n", argv0 );
		    exit( 1 );
		    }
		conds[num_conds].op = OP_NOT_EQUALS;
		break;
		case '<':
		if ( *(cp+1) == '=' )
		    {
		    ++cp;
		    conds[num_conds].op = OP_LESS_EQUALS;
		    }
		else
		    conds[num_conds].op = OP_LESS;
		break;
		case '>':
		if ( *(cp+1) == '=' )
		    {
		    ++cp;
		    conds[num_conds].op = OP_GREATER_EQUALS;
		    }
		else
		    conds[num_conds].op = OP_GREATER;
		break;
		default:
		(void) fprintf(
		    stderr, "%s: no operator found in '%s'\n",
		    argv0, argv[argn] );
		exit( 1 );
		}
	    /* Strip blanks between the name and the operator, then cut the
	    ** argument in place so argv[argn] is just the field name.
	    */
	    while ( name_end > argv[argn] &&
		    ( *(name_end-1) == ' ' || *(name_end-1) == '\t' ) )
		--name_end;
	    *name_end = '\0';
	    conds[num_conds].field_name = argv[argn];
	    /* Step past the operator and any blanks to reach the value. */
	    ++cp;
	    while ( *cp == ' ' || *cp == '\t' )
		++cp;
	    conds[num_conds].str = strdup_check( cp );
	    if ( conds[num_conds].op == OP_EQUALS ||
	         conds[num_conds].op == OP_NOT_EQUALS )
		{
		/* Check if this is a wildcard match. */
		if ( strpbrk( conds[num_conds].str, wildcards ) != (char*) 0 )
		    {
		    int r;
		    if ( conds[num_conds].op == OP_EQUALS )
			conds[num_conds].op = OP_WILDCARD;
		    else
			conds[num_conds].op = OP_NOT_WILDCARD;
		    if ( ignore_case )
			r = regcomp(
			    &(conds[num_conds].re), conds[num_conds].str,
			    REG_BASIC|REG_NOSUB|REG_ICASE );
		    else
			r = regcomp(
			    &(conds[num_conds].re), conds[num_conds].str,
			    REG_BASIC|REG_NOSUB );
		    if ( r != 0 )
			{
			char buf[1000];
			(void) regerror(
			    r, &(conds[num_conds].re), buf, sizeof(buf) );
			(void) fprintf(
			    stderr, "%s: regexp problem - %s\n", argv0, buf );
			exit( 1 );
			}
		    }
		}
	    else
		{
		/* The op is numeric - make sure the str is too. */
		const char* cp2;
		for ( cp2 = conds[num_conds].str; *cp2 != '\0'; ++cp2 )
		    if ( ( *cp2 < '0' || *cp2 > '9' ) && *cp2 != '-' )
			{
			(void) fprintf( stderr,
			    "%s: non-numeric arg for numeric operator - '%s'\n",
			    argv0, conds[num_conds].str );
			exit( 1 );
			}
		conds[num_conds].val = atoi( conds[num_conds].str );
		}
	    ++num_conds;
	    ++argn;
	    }
	}

    /* A name/value database has no schema to take "all fields" from. */
    if ( all_fields && input_mode == IM_NAMEVALUE )
	{
	(void) fprintf( stderr,
	    "%s: can't use -all with a name/value database\n", argv0 );
	exit( 1 );
	}

    if ( input_mode == IM_MERGE || input_mode == IM_SCHEMA )
	{
	schema = strlist_new_check();
	if ( input_mode == IM_SCHEMA )
	    {
	    /* Read the db schema. */
	    fp = fopen( schema_file_name, "r" );
	    if ( fp == (FILE*) 0 )
		{
		perror( schema_file_name );
		exit( 1 );
		}
	    init_get_line( fp );
	    line = get_line( fp );
	    if ( line == (char*) 0 )
		{
		(void) fprintf( stderr, "%s: can't read schema file\n", argv0 );
		exit( 1 );
		}
	    parse_fields( line, schema );
	    (void) fclose( fp );
	    /* With -all the output list simply is the schema. */
	    if ( all_fields )
		field_names = schema;
	    }
	}
    else
	schema = (strlist) 0;
    record_list = strlist_new_check();

    /* Ok!  All args are now parsed.  We can start reading files. */
    record_tab = strtab_new_check();
    for ( file_num = 0; file_num < strlist_size( file_names ); ++file_num )
	{
	file_name = strlist_get( file_names, file_num );
	if ( strcmp( file_name, "-" ) == 0 )
	    fp = stdin;
	else
	    {
	    fp = fopen( file_name, "r" );
	    if ( fp == (FILE*) 0 )
		{
		perror( file_name );
		exit( 1 );
		}
	    }
	init_get_line( fp );

	if ( input_mode == IM_MERGE )
	    {
	    /* Read the schema on the first line of the file. */
	    line = get_line( fp );
	    if ( line == (char*) 0 )
		{
		(void) fprintf( stderr, "%s: can't read schema file\n", argv0 );
		exit( 1 );
		}
	    strlist_clear( schema );
	    parse_fields( line, schema );
	    if ( all_fields )
		field_names = schema;
	    }

	/* For each line in the file. */
	while ( ( line = get_line( fp ) ) != (char*) 0 )
	    {
	    /* Parse the line into fields. */
	    parse_fields( line, record_list );
	    /* Rebuild the name -> value table for this record. */
	    strtab_clear( record_tab );
	    switch ( input_mode )
		{
		case IM_MERGE:
		case IM_SCHEMA:
		/* Pair the Nth schema name with the Nth field; a name with
		** no corresponding field is left out of the table.
		*/
		for ( field_num = 0; field_num < strlist_size( schema ); ++field_num )
		    {
		    field_name = strlist_get( schema, field_num );
		    field_value = strlist_get( record_list, field_num );
		    if ( field_value != (char*) 0 )
			strtab_add_check( record_tab, field_name, field_value );
		    }
		break;
		case IM_NAMEVALUE:
		/* Split each field at its first '=' (in place); a field
		** without '=' becomes a name with an empty value.
		*/
		for ( field_num = 0; field_num < strlist_size( record_list ); ++field_num )
		    {
		    char* fv;	/* not const */
		    field_name = strlist_get( record_list, field_num );
		    fv = strchr( field_name, '=' );
		    if ( fv != (char*) 0 )
			*fv++ = '\0';
		    else
			fv = "";
		    strtab_add_check( record_tab, field_name, fv );
		    }
		break;
		}

	    if ( num_conds > 0 )
		/* Check if we should show this record. */
		if ( ! check_conds( record_tab, conds, num_conds ) )
		    continue;

	    /* Write out the requested fields. */
	    for ( field_num = 0; field_num < strlist_size( field_names ); ++field_num )
		{
		if ( field_num != 0 )
		    putchar( field_seps[0] );
		field_name = strlist_get( field_names, field_num );
		field_value = strtab_find( record_tab, field_name );
		if ( field_value != (char*) 0 )
		    (void) fputs( field_value, stdout );
		}
	    putchar( '\n' );
	    }

	/* Done with this file. */
	if ( fp != stdin )
	    (void) fclose( fp );
	}

    /* Done.  With -all, field_names is the schema list itself, so it
    ** must not be deleted a second time.
    */
    strlist_delete( file_names );
    if ( ! all_fields )
	strlist_delete( field_names );
    if ( input_mode == IM_SCHEMA || input_mode == IM_MERGE )
	strlist_delete( schema );
    strlist_delete( record_list);
    strtab_delete( record_tab );
    exit( 0 );
    }


/* Print the command-line synopsis on stderr and exit with status 1.
** Does not return.
**
** The synopsis lists every option main() accepts, including -i and -all,
** and marks the -where clause as optional, matching the parser.
*/
static void
usage( void )
    {
    (void) fprintf( stderr, "usage:  %s -N|-M|-S schemafile [-m] [-f fieldseps] [-o] [-i] {fieldname ...|-all} -from filename ... [-where conditional ...]\n", argv0 );
    exit( 1 );
    }


/* State shared by init_get_line() and get_line().  There is a single
** copy of it, so only one stream can be read at a time.
*/
static char* line;	/* growable buffer in which a logical line is assembled */
static int line_size = 0;	/* allocated size of line; 0 until first allocated */
static char buf[1000];	/* read-ahead: the most recent fgets() result, not yet consumed */
static int eof;	/* non-zero once fgets() has failed, i.e. buf holds nothing new */

/* Prepare the line reader for a new stream.
**
** Allocates the shared line buffer the first time it is called, then
** primes the read-ahead by fetching the first chunk of fp into buf.
** The end-of-input flag is set right away if that first read fails.
*/
static void
init_get_line( FILE* fp )
    {
    /* The line buffer is allocated once and kept across streams. */
    if ( line_size == 0 )
	{
	line_size = 5000;
	line = (char*) malloc_check( (size_t) line_size );
	}

    /* Read ahead one chunk; a failed read means there is no input. */
    eof = ( fgets( buf, sizeof(buf), fp ) == (char*) 0 );
    }

/* Return the next logical line of fp, or a null pointer at end of input.
**
** A logical line is one physical line plus any following physical lines
** whose first character is one of field_seps (continuation lines),
** appended with their trailing CR/LF characters removed.  Everything
** from the first '#' on, together with the blanks and tabs just before
** it, is cut off, and lines that end up empty are skipped.
**
** init_get_line( fp ) must have been called for the stream first: it
** primes buf, the one-chunk read-ahead this function consumes.
**
** The returned pointer refers to the shared line buffer; its contents
** are overwritten by the next call.
**
** NOTE(review): fgets() delivers a physical line longer than
** sizeof(buf)-1 characters in several chunks, and a later chunk is only
** joined to the line if it happens to start with a separator.
*/
static char*
get_line( FILE* fp )
    {
    int buf_len, line_len;
    char* cp;

    /* Loop until non-blank line. */
    do {
	if ( eof )
	    return (char*) 0;

	/* Accumulate a line. */
	line_len = 0;
	for (;;)
	    {
	    /* Trim newline.  The length test keeps the scan inside buf
	    ** when the chunk consists only of line terminators, as it
	    ** does for every blank line.
	    */
	    buf_len = strlen( buf );
	    while ( buf_len > 0 &&
		    ( buf[buf_len-1] == '\n' || buf[buf_len-1] == '\r' ) )
		--buf_len;
	    buf[buf_len] = '\0';
	    /* Append buf to line, growing line first if needed. */
	    if ( line_len + buf_len + 1 > line_size )
		{
		line_size = ( line_len + buf_len + 1 ) * 2;
		line = (char*) realloc_check(
		    (void*) line, line_size * sizeof(char) );
		}
	    (void) strcpy( &line[line_len], buf );
	    line_len += buf_len;
	    /* Get the next buf. */
	    if ( fgets( buf, sizeof(buf), fp ) == (char*) 0 )
		{
		eof = 1;
		break;
		}
	    /* Is it a continuation?  If not, leave it in buf for the
	    ** next call.
	    */
	    if ( strchr( field_seps, buf[0] ) == (char*) 0 )
		break;
	    }

	/* Trim comments. */
	cp = strchr( line, '#' );
	if ( cp != (char*) 0 )
	    {
	    while ( cp > line && ( *(cp-1) == ' ' || *(cp-1) == '\t' ) )
		--cp;
	    *cp = '\0';
	    }
	}
    while ( line[0] == '\0' );

    return line;
    }


/* Split line into fields and store them in list.
**
** list is emptied first.  line is cut up in place: each separator that
** ends a field is overwritten with '\0'.  Any character in field_seps
** separates fields; when multiple_seps is set, further separators that
** directly follow one are skipped as well.  At least one field (possibly
** empty) is always added.
*/
static void
parse_fields( char* line, strlist list )
    {
    char* field = line;

    strlist_clear( list );
    for (;;)
	{
	char* end = field + strcspn( field, field_seps );
	const int last = ( *end == '\0' );

	/* Terminate the field unless the line's own '\0' already does. */
	if ( ! last )
	    *end = '\0';
	strlist_add_check( list, field );
	if ( last )
	    break;

	/* Move past the separator just overwritten. */
	field = end + 1;
	if ( multiple_seps )
	    field += strspn( field, field_seps );
	}
    }


/* Decide whether a record satisfies the set of conditions.
**
** With or_mode set the result is 1 as soon as any condition holds and 0
** if none does; otherwise the result is 0 as soon as any condition
** fails and 1 if none does.  Conditions are tried in order and
** evaluation stops at the first deciding one.
*/
static int
check_conds( strtab record_tab, struct cond* conds, int num_conds )
    {
    /* The single outcome that settles the answer early: a true
    ** condition when OR-ing, a false one when AND-ing.
    */
    const int deciding = or_mode ? 1 : 0;
    int i;

    for ( i = 0; i < num_conds; ++i )
	if ( ( check_cond( record_tab, &conds[i] ) != 0 ) == deciding )
	    return deciding;

    return ! deciding;
    }


/* Evaluate a single condition against a record; returns non-zero if it
** holds.
**
** The field named by the condition is looked up in record_tab; a field
** that is not present is treated as the empty string.  Equality tests
** compare text (case-insensitively when ignore_case is set), wildcard
** tests run the precompiled regular expression, and ordering tests
** compare atoi() of the field with the condition's integer value.
** An operator code that is none of the OP_* values yields 1.
*/
static int
check_cond( strtab record_tab, struct cond* c )
    {
    const char* value = strtab_find( record_tab, c->field_name );

    if ( value == (char*) 0 )
	value = "";

    switch ( c->op )
	{
	case OP_EQUALS:
	case OP_NOT_EQUALS:
	    {
	    const int differ = ignore_case ?
		strcasecmp( value, c->str ) : strcmp( value, c->str );

	    return c->op == OP_EQUALS ? differ == 0 : differ != 0;
	    }

	case OP_WILDCARD:
	case OP_NOT_WILDCARD:
	    {
	    const int rc = regexec( &(c->re), value, 0, (regmatch_t*) 0, 0 );

	    /* Anything other than REG_NOMATCH counts as a match. */
	    return c->op == OP_WILDCARD ?
		rc != REG_NOMATCH : rc == REG_NOMATCH;
	    }

	case OP_LESS:
	case OP_LESS_EQUALS:
	case OP_GREATER:
	case OP_GREATER_EQUALS:
	    {
	    const int number = atoi( value );

	    if ( c->op == OP_LESS )
		return number < c->val;
	    if ( c->op == OP_LESS_EQUALS )
		return number <= c->val;
	    if ( c->op == OP_GREATER )
		return number > c->val;
	    return number >= c->val;
	    }
	}

    return 1;
    }


/* malloc() that does not return on failure: a null result is handed to
** check(), which reports it and exits.
*/
static void*
malloc_check( size_t size )
    {
    void* const mem = malloc( size );

    check( mem );
    return mem;
    }


/* realloc() that does not return on failure: a null result is handed to
** check(), which reports it and exits.  Returns the resized block.
*/
static void*
realloc_check( void* ptr, size_t size )
    {
    void* const resized = realloc( ptr, size );

    check( resized );
    return resized;
    }


/* strdup() that does not return on failure: a null result is handed to
** check(), which reports it and exits.  Returns the new copy of str.
*/
static const char*
strdup_check( const char* str )
    {
    char* const copy = strdup( str );

    check( (void*) copy );
    return copy;
    }


/* Create a new string list via strlist_new(); a null result is handed
** to check(), which reports it and exits.
**
** NOTE(review): the declared return type is strtab although the value
** is a strlist and callers store it as one - presumably the two
** typedefs are interchangeable; confirm against strlist.h / strtab.h.
*/
static strtab
strlist_new_check( void )
    {
    strlist list = strlist_new();

    check( (void*) list );
    return list;
    }


/* Append str to the list s with strlist_add(); if that reports failure,
** print an out-of-memory message and exit with status 1.
*/
static void
strlist_add_check( strlist s, const char* str )
    {
    if ( strlist_add( s, str ) )
	return;

    (void) fprintf( stderr, "%s: out of memory\n", argv0 );
    exit( 1 );
    }


/* Create a new string table via strtab_new(); a null result is handed
** to check(), which reports it and exits.
*/
static strtab
strtab_new_check( void )
    {
    strtab table = strtab_new();

    check( (void*) table );
    return table;
    }


/* Store the name/value pair in the table s with strtab_add(); if that
** reports failure, print an out-of-memory message and exit with status 1.
*/
static void
strtab_add_check( strtab s, const char* name, const char* value )
    {
    if ( strtab_add( s, name, value ) )
	return;

    (void) fprintf( stderr, "%s: out of memory\n", argv0 );
    exit( 1 );
    }


/* Common failure test for the allocation wrappers: if ptr is null,
** print an out-of-memory message on stderr and exit with status 1;
** otherwise do nothing.
*/
static void
check( void* ptr )
    {
    if ( ptr != (void*) 0 )
	return;

    (void) fprintf( stderr, "%s: out of memory\n", argv0 );
    exit( 1 );
    }
