/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */
/* ./glimpse/index/glimpse.c */
#include "glimpse.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#if	ISO_CHAR_SET
#include <locale.h>	/* support for 8bit character set:ew@senate.be */
#endif

extern char **environ;
/* NOTE(review): errno is not referenced in the code visible in this file; modern libcs expect <errno.h> rather than a manual declaration */
extern int errno;
#if	BG_DEBUG
extern FILE  *LOGFILE; 	/* stdio stream for LOG output (opened in main(), falls back to stderr) */
#endif	/*BG_DEBUG*/
extern FILE  *STATFILE;	/* stdio stream for statistical data about indexed files (opened for append in main()) */
extern FILE  *MESSAGEFILE;	/* stdio stream for important messages meant for the user (opened in main()) */
extern char  INDEX_DIR[MAX_LINE_LEN];	/* directory holding the .glimpse_* files: -H, else $HOME, else the working directory */
extern struct stat istbuf;	/* filled in main() by stat() on the existing index file; its st_ctime is compared against file ctimes for -f */

/*
 * Indexing options. All are defined elsewhere; the flag named next to each
 * one is the command-line option that sets it in main().
 */
extern int indexable_char[256];	/* passed to set_indexable_char() in main() */
extern int GenerateHash;	/* -h */
extern int KeepFilenames;	/* -k */
extern int OneFilePerBlock;	/* -o; forced ON by -b; overwritten from the existing index type when updating */
extern int IndexNumber;	/* -n */
extern int CountWords;	/* -c; forced OFF by -b, forced ON in BUILDCAST builds */
extern int StructuredIndex;	/* -s */
extern int MAXWORDSPERFILE;	/* -w #; 0 is reported as "infinity" */
extern int NUMERICWORDPERCENT;	/* -n #; must be in [0..100] */
extern int AddToIndex;	/* -a */
extern int DeleteFromIndex;	/* -d */
extern int PurgeIndex;	/* -D turns it OFF */
extern int FastIndex;	/* -f */
extern int BuildDictionary;	/* set ON unconditionally at the start of main() */
extern int BuildDictionaryExisting;	/* -E (BUILDCAST builds only) */
extern int CompressAfterBuild;	/* -C (BUILDCAST builds only) */
extern int IncludeHigherPriority;	/* -i */
extern int FilenamesOnStdin;	/* -F */
extern int UseFilters;	/* -z */
extern int ByteLevelIndex;	/* -b; also set when the existing index type is negative */
/* extern int IndexUnderscore; */
extern int IndexableFile;	/* -I */
extern int MAX_PER_MB, MAX_INDEX_PERCENT;	/* -S sets both to the same value */
extern int I_THRESHOLD;	/* derived from -M (memory in MB) */
extern int BigHashTable;	/* -B */
extern int IndexEverything;	/* -E (non-BUILDCAST builds) */
extern int BuildTurbo;	/* -T; main() forces it ON after option parsing anyway */

/* Checked at the end of main() to decide whether to print summary warnings */
extern int AddedMaxWordsMessage;
extern int AddedMixedWordsMessage;

/* State shared with the partitioning/indexing code, which is defined in other files */
extern int file_num;
extern int old_file_num;
extern int new_file_num;
extern int file_id;
extern int part_num;
extern char **name_list[MAXNUM_INDIRECT];	/* zeroed in main(); accessed through LIST_GET/LIST_SUREGET */
extern int p_table[MAX_PARTITION];
extern int  *size_list[MAXNUM_INDIRECT];	/* zeroed in main(); filled through LIST_ADD */
extern int p_size_list[];	/* main() zeroes MAX_PARTITION entries */
extern unsigned int *disable_list;	/* bit per file, indexed with block2index(i) and mask_int[i % (8*sizeof(int))] */
extern int memory_usage;
extern int mask_int[];
extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;
extern struct indices	*deletedlist;	/* assigned from get_removed_indices() in main() */
extern char sync_path[MAX_LINE_LEN];	/* set by determine_sync() */

extern set_usemalloc();	/* compress/misc.c */

char IProgname[MAX_LINE_LEN];	/* copy of argv[0], printed by usage() */

/*
 * Decide whether moving from oldnum to newnum steps over one of the
 * encoding limits (MaxNum8bPartition, MaxNum12bPartition or
 * MaxNum16bPartition). This happens so rarely that the old index is not
 * converted and reused; the caller simply re-indexes from scratch.
 *
 * Returns 1 (after writing a notice to MESSAGEFILE) when a limit is
 * crossed, 0 otherwise. A non-positive oldnum always yields 0.
 */
cross_boundary(oldnum, newnum)
	int	oldnum, newnum;
{
	int	crossed = 0;

	if (oldnum > 0) {
		if ((oldnum <= MaxNum8bPartition) && (newnum > MaxNum8bPartition))
			crossed = 1;
		else if ((oldnum <= MaxNum12bPartition) && (newnum > MaxNum12bPartition))
			crossed = 1;
		else if ((oldnum <= MaxNum16bPartition) && (newnum > MaxNum16bPartition))
			crossed = 1;

		if (crossed)
			fprintf(MESSAGEFILE, "Must change index format. Commencing fresh indexing...\n");
	}
	return crossed;
}

determine_sync()
{
	char	S[1024], s1[256], s2[256];
	FILE	*fp;
	int	i, ret;

	strcpy(sync_path, "sync");
	sprintf(S, "exec whereis sync > /tmp/zz.%d", getpid());
	system(S);
	sprintf(S, "/tmp/zz.%d", getpid());
	if ((fp = fopen(S, "r")) == NULL) {
		/* printf("11111\n"); */
		return 0;
	}
	if ((ret = fread(S, 1, 1024, fp)) <= 0) {
		sprintf(S, "/tmp/zz.%d", getpid());
		unlink(S);
		fclose(fp);
		/* printf("22222\n"); */
		return 0;
	}
	sprintf(s1, "/tmp/zz.%d", getpid());
	unlink(s1);
	fclose(fp);
	/* printf("read: %s\n", S); */

	sscanf(S, "%s%s", s1, s2);
	/* printf("s1=%s s2=%s\n", s1, s2); */
	if (strncmp(s1, "sync", 4)) {
		/* printf("33333\n"); */
		return 0;
	}
	if (!strcmp(s2, "") || !strcmp(s2, " ")) {
		/* printf("44444\n"); */
		return 0;
	}
	if (strstr(s2, "sync") == NULL) {
		/* printf("55555\n"); */
		return 0;
	}
	strcpy(sync_path, s2);
	/* printf("Using sync in: %s\n", sync_path); */
	return 1;
}

/*
 * Program entry point for glimpseindex (or buildcast when compiled with
 * BUILDCAST).
 *
 * Steps, in order:
 *   1. parse command-line options into the global option variables;
 *   2. reject invalid option combinations;
 *   3. resolve INDEX_DIR to an absolute path (-H, else $HOME, else cwd);
 *   4. open the message/statistics (and, with BG_DEBUG, log) files;
 *   5. BUILDCAST: move the old search index aside, back up old cast
 *      dictionaries, index, compute the dictionary, optionally compress
 *      the files, then restore the search index;
 *      otherwise: update an existing index (-a / -d / -f) when possible,
 *      or fall back to fresh indexing, save the data structures and list
 *      the created .glimpse_* files.
 *
 * Returns 0 on normal completion. usage() terminates the process with
 * status 1; invalid option combinations terminate it with status 2;
 * BUILDCAST returns -1 when a temporary/backup directory cannot be made.
 */
int main(int argc, char **argv)
{
    int pid = getpid();
    int	i, m = 0;
    char *indexdir;
    char s[MAX_LINE_LEN], s1[MAX_LINE_LEN];
    char working_dir[MAX_LINE_LEN];
    FILE *tmpfp;
    char hash_file[MAX_LINE_LEN], string_file[MAX_LINE_LEN], freq_file[MAX_LINE_LEN];
    char tmpbuf[1024];
    struct stat stbuf;
    char name[MAX_LINE_LEN];
    char outname[MAX_LINE_LEN];
    int specialwords, threshold;
    int backup;
    struct indices *get_removed_indices();
    struct timeval tv;

#if	ISO_CHAR_SET
    setlocale(LC_ALL,""); /* support for 8bit character set: ew@senate.be, Henrik.Martin@eua.ericsson.se */
#endif
    BuildDictionary = ON;
    set_usemalloc();
    srand(pid);
    umask(077);
    determine_sync();

    INDEX_DIR[0] = '\0';
    specialwords = threshold = -1;	/* so that compute_dictionary can use defaults not visible here */
    /* strncpy() does not terminate on truncation, so terminate explicitly */
    strncpy(IProgname, argv[0], MAX_LINE_LEN - 1);
    IProgname[MAX_LINE_LEN - 1] = '\0';
    memset(size_list, '\0', sizeof(int *) * MAXNUM_INDIRECT);	/* free it once partition successfully calculates p_size_list */
    memset(name_list, '\0', sizeof(char **) * MAXNUM_INDIRECT);
    memset(p_size_list, '\0', sizeof(int) * MAX_PARTITION);
    build_filename_hashtable((char *)NULL, 0);

    /*
     * Process options.
     */

    while (argc > 1) {
	if (strcmp(argv[1], "-help") == 0) {
	    return usage(1);
	}
#if	!BUILDCAST
	else if (strcmp(argv[1], "-V") == 0) {
	    printf("\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
	    return(0);
	}
	else if (strcmp(argv[1], "-T") == 0) {
	    BuildTurbo = ON;
	    argc --; argv ++;
	}
	else if (strcmp(argv[1], "-I") == 0) {
	    IndexableFile = ON;
	    argc --; argv ++;
	}
	else if(strcmp(argv[1], "-a") == 0) {
	    AddToIndex = ON;
	    argc--; argv++;
	}
	else if(strcmp(argv[1], "-b") == 0) {
	    ByteLevelIndex = ON;
	    argc--; argv++;
	}
	else if(strcmp(argv[1], "-c") == 0) {
	    CountWords = ON;
	    argc--; argv++;
	}
	else if(strcmp(argv[1], "-d") == 0) {
	    DeleteFromIndex = ON;
	    argc --; argv ++;
	}
	else if(strcmp(argv[1], "-D") == 0) {
	    PurgeIndex = OFF;
	    argc --; argv ++;
	}
	else if(strcmp(argv[1], "-f") == 0) {
	    FastIndex = ON;
	    argc--; argv++;
	}
	else if (strcmp(argv[1], "-o") == 0) {
	    OneFilePerBlock = ON;
	    argc --; argv ++;
	}
	else if (strcmp(argv[1], "-s") == 0) {
	    StructuredIndex = ON;
	    argc --; argv ++;
	}
	else if(strcmp(argv[1], "-z") == 0) {
	    UseFilters = ON;
	    argc--; argv++;
	}
#else	/*!BUILDCAST*/
	else if (strcmp(argv[1], "-V") == 0) {
	    printf("\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
	    return(0);
	}
	else if(strcmp(argv[1], "-C") == 0) {
	    CompressAfterBuild = ON;
	    argc --; argv ++;
	}
	else if(strcmp(argv[1], "-E") == 0) {
	    BuildDictionaryExisting = ON;
	    argc --; argv ++;
	}
	else if (strcmp(argv[1], "-t") == 0) {
	    /* cast to unsigned char: isdigit() is undefined for negative char values */
	    if ((argc <= 2) || !(isdigit((unsigned char)argv[2][0]))) {
		return usage(1);
	    }
	    else {
		threshold = atoi(argv[2]);
		argc -= 2; argv += 2;
	    }
	}
	else if (strcmp(argv[1], "-l") == 0) {
	    if ((argc <= 2) || !(isdigit((unsigned char)argv[2][0]))) {
		return usage(1);
	    }
	    else {
		specialwords = atoi(argv[2]);
		argc -= 2; argv += 2;
	    }
	}
#endif	/*!BUILDCAST*/
	else if (strcmp(argv[1], "-M") == 0) {
	    if (argc == 2) {
		fprintf(stderr, "-M should be followed by the amount of memory in MB for indexing words\n");
		return usage(1);
	    }
	    m = atoi(argv[2]);
	    if (m < 1) {
		/* NOTE(review): the message says the value is ignored, yet usage() exits; confirm which one is intended */
		fprintf(stderr, "Ignoring -M %d (< 1 MB). Using default value of about 2 MB\n", m);
		return usage(1);
	    }
	    else {
		/*
		 * Calculate I_THRESHOLD approximately. Note: 2*1024*1024*2 / (2*24 + 32 + 12) = 47662, DEF_I_THRESHOLD = 40000, so OK
		 * N * sizeofindices + N*(avgwordlen + sizeoftoken)/indicespertoken <= mem
		 * elemsperset = occurrences/indicespertoken
		 * N <= mem * occurrences / (sizeofindices*indicespertoken + avgwordlen + sizeoftoken)
		 *
		 * The numerator is computed in size_t so that large -M values
		 * do not overflow a signed int before the division.
		 */
		I_THRESHOLD = (size_t)m * 1024 * 1024 * (INDICES_PER_TOKEN) /
				(INDICES_PER_TOKEN * sizeof(struct indices) + sizeof(struct token) + AVG_WORD_LEN);
		fprintf(stderr, "Using %d words as threshold before merge\n", I_THRESHOLD/INDICES_PER_TOKEN);
	    }
	    argc -= 2; argv += 2;
	}
	else if (strcmp(argv[1], "-w") == 0) {
	    if (argc == 2) {
		fprintf(stderr, "-w should be followed by the number of words\n");
		return usage(1);
	    }
	    MAXWORDSPERFILE = atoi(argv[2]);
	    argc -= 2; argv += 2;
	}
	else if (strcmp(argv[1], "-S") == 0) {
	    if (argc == 2) {
		fprintf(stderr, "-S should be followed by the stop list limit\n");
		return usage(1);
	    }
	    MAX_PER_MB = MAX_INDEX_PERCENT = atoi(argv[2]);
	    argc -= 2; argv += 2;
	}
	else if(strcmp(argv[1], "-n") == 0) {
	    IndexNumber = ON;
	    if ((argc <= 2) || !(isdigit((unsigned char)argv[2][0]))) {	/* -n has no arg */
		argc --; argv ++;
	    }
	    else {
		NUMERICWORDPERCENT = atoi(argv[2]);
		if ((NUMERICWORDPERCENT > 100) || (NUMERICWORDPERCENT < 0)) {
		    fprintf(stderr, "The percentage of numeric words must be in [0..100]\n");
		    return usage(1);
		}
		argc-=2; argv+=2;
	    }
	}
	else if(strcmp(argv[1], "-h") == 0) {
	    /* I want to generate .glimpse_filehash and .glimpse_filehash_index */
	    GenerateHash = ON;
	    argc --; argv ++;
	}
	else if(strcmp(argv[1], "-i") == 0) {
	    IncludeHigherPriority = ON;
	    argc --; argv ++;
	}
	else if(strcmp(argv[1], "-k") == 0) {
	    /* I want to know what files were there before: used in SFS to compute new sets from old ones */
	    KeepFilenames = ON;
	    argc --; argv ++;
	}
	else if (strcmp(argv[1], "-B") == 0) {
		BigHashTable = 1;
		argc --; argv ++;
	}
	else if (strcmp(argv[1], "-E") == 0) {
		/* In BUILDCAST builds the earlier -E branch matches first, so this one is only reached otherwise */
		IndexEverything = 1;	/* without doing stat tests, etc. */
		argc --; argv ++;
	}
	else if(strcmp(argv[1], "-F") == 0) {
	    FilenamesOnStdin = ON;
	    argc--; argv++;
	}
	/*
	else if(strcmp(argv[1], "-u") == 0) {
	    IndexUnderscore = ON;
	    argc--; argv++;
	}
	*/
	else if (strcmp(argv[1], "-H") == 0) {
	    if (argc == 2) {
		fprintf(stderr, "-H should be followed by a directory name\n");
		return usage(1);
	    }
	    strncpy(INDEX_DIR, argv[2], MAX_LINE_LEN - 1);
	    INDEX_DIR[MAX_LINE_LEN - 1] = '\0';
	    argc -= 2; argv += 2;
	}
	else break;	/* rest are directory names */
    }

    BuildTurbo = ON;	/* always ON: user can remove .glimpse_turbo if not needed */
    /*
     * Look for invalid option combos.
     */

    if ((argc<=1) && (!FilenamesOnStdin) && !FastIndex) {
	return usage(1);
    }

    if (DeleteFromIndex && (AddToIndex || CountWords || IndexableFile)) {
	/* With -f, it is automatic for files not found in OS but present in index; without it, an explicit set of files is required as argument on cmdline */
	fprintf(stderr, "-d cannot be used with -I, -a or -c (see man pages)\n");
	exit(2);
    }

    if (ByteLevelIndex) {
	if (MAX_PER_MB <= 0) {
	    fprintf(stderr, "Stop list limit (#of occurrences per MB) '%d' must be > 0\n", MAX_PER_MB);
	    exit(2);
	}
    }
    else if (OneFilePerBlock) {
	if ((MAX_INDEX_PERCENT <= 0) || (MAX_INDEX_PERCENT > 100)) {
	    fprintf(stderr, "Stop list limit (%% of occurrences in files) '%d' must be in (0, 100]\n", MAX_INDEX_PERCENT);
	    exit(2);
	}
    }

    /*
     * Find the index directory since it is used in all options.
     */

    if (INDEX_DIR[0] == '\0') {
	if ((indexdir = getenv("HOME")) == NULL) {
	    getcwd(INDEX_DIR, MAX_LINE_LEN-1);
	    fprintf(stderr, "Using working-directory '%s' to store index\n\n", INDEX_DIR);
	}
	else {
	    strncpy(INDEX_DIR, indexdir, MAX_LINE_LEN - 1);
	    INDEX_DIR[MAX_LINE_LEN - 1] = '\0';
	}
    }
    /* chdir into INDEX_DIR and back again purely to obtain its absolute path */
    getcwd(working_dir, MAX_LINE_LEN - 1);
    if (-1 == chdir(INDEX_DIR)) {
	fprintf(stderr, "Cannot change directory to %s\n", INDEX_DIR);
	return usage(0);
    }
    getcwd(INDEX_DIR, MAX_LINE_LEN - 1);	/* must be absolute path name */
    chdir(working_dir);	/* get back to where you were */

    if (IndexableFile) {	/* traverse the given directories and output names of files that are indexable on stdout */
	partition(argc, argv);
	return 0;
    }
    else {
#if	BUILDCAST
	printf("\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
#else	/*BUILDCAST*/
	printf("\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
#endif	/*BUILDCAST*/
    }

    if (ByteLevelIndex) {
#if	0
	/* We'll worry about these things later */
	if (AddToIndex || DeleteFromIndex || FastIndex) {
	    fprintf(stderr, "Fresh indexing recommended: -a, -d and -f are not supported with -b as yet\n");
	    exit(1);
	}
	AddToIndex = FastIndex = OFF;
#endif
	CountWords = OFF;
	OneFilePerBlock = ON;
    }

    /*
     * CONVENTION: all the relevant output is on stdout; warnings/errors are on stderr.
     * Initialize / open important files.
     */

    read_filters(INDEX_DIR, UseFilters);

    freq_file[0] = hash_file[0] = string_file[0] = '\0';
    strcpy(freq_file, INDEX_DIR);
    strcat(freq_file, "/");
    strcat(freq_file, DEF_FREQ_FILE);
    strcpy(hash_file, INDEX_DIR);
    strcat(hash_file, "/");
    strcat(hash_file, DEF_HASH_FILE);
    strcpy(string_file, INDEX_DIR);
    strcat(string_file, "/");
    strcat(string_file, DEF_STRING_FILE);
    initialize_tuncompress(string_file, freq_file, 0);

#if	BG_DEBUG
    sprintf(s, "%s/%s", INDEX_DIR, DEF_LOG_FILE);
    if((LOGFILE = fopen(s, "w")) == 0) {
	fprintf(stderr, "can't open %s for writing\n", s);
	LOGFILE = stderr;
    }
#endif	/*BG_DEBUG*/

    sprintf(s, "%s/%s", INDEX_DIR, DEF_MESSAGE_FILE);
    if((MESSAGEFILE = fopen(s, "w")) == 0) {
	fprintf(stderr, "can't open %s for writing\n", s);
	MESSAGEFILE = stderr;
    }

    sprintf(s, "%s/%s", INDEX_DIR, DEF_STAT_FILE);
    if((STATFILE = fopen(s, "a")) == 0) {
	fprintf(stderr, "can't open %s for appending\n", s);
	STATFILE = stderr;
    }
    gettimeofday(&tv, NULL);
#if	BUILDCAST
    fprintf(STATFILE, "\nThis is buildcast version %s, %s. %s", GLIMPSE_VERSION, GLIMPSE_DATE, ctime(&tv.tv_sec));
#else
    fprintf(STATFILE, "\nThis is glimpseindex version %s, %s. %s", GLIMPSE_VERSION, GLIMPSE_DATE, ctime(&tv.tv_sec));
#endif

#if	BG_DEBUG
    fprintf(LOGFILE, "Index Directory = %s\n\n", INDEX_DIR);
#endif	/*BG_DEBUG*/
    if (MAXWORDSPERFILE != 0) fprintf(MESSAGEFILE, "Index: maximum number of indexed words per file = %d\n", MAXWORDSPERFILE);
    else fprintf(MESSAGEFILE, "Index: maximum number of indexed words per file = infinity\n");
    fprintf(MESSAGEFILE, "Index: maximum percentage of numeric words per file = %d\n", NUMERICWORDPERCENT);

    set_indexable_char(indexable_char);

#if	BUILDCAST

    CountWords = ON;
    AddToIndex = OFF;
    FastIndex = OFF;

    /* Save old search-dictionaries */

    sprintf(s, "%s/.glimpse_index", INDEX_DIR);
    if (!access(s, R_OK)) {
	sprintf(s, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	if (-1 == mkdir(s, 0700)) {
	    fprintf(stderr, "cannot create temporary directory %s\n", s);
	    return -1;
	}
#if	SFS_COMPAT
	sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);
	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	rename(s, s1);
#else
	sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, INDEX_FILE, INDEX_DIR, pid);
	system(s);
#endif
#if	SFS_COMPAT
	sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);
	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	rename(s, s1);
#else
	sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, P_TABLE, INDEX_DIR, pid);
	system(s);
#endif
#if	SFS_COMPAT
	sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST);
	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	rename(s, s1);
#else
	sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, NAME_LIST, INDEX_DIR, pid);
	system(s);
#endif
#if	SFS_COMPAT
	sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST_INDEX);
	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	rename(s, s1);
#else
	sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, NAME_LIST_INDEX, INDEX_DIR, pid);
	system(s);
#endif
#if	SFS_COMPAT
	sprintf(s, "%s/%s", INDEX_DIR, NAME_HASH);
	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	rename(s, s1);
#else
	sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, NAME_HASH, INDEX_DIR, pid);
	system(s);
#endif
#if	SFS_COMPAT
	sprintf(s, "%s/%s", INDEX_DIR, NAME_HASH_INDEX);
	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	rename(s, s1);
#else
	sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, NAME_HASH_INDEX, INDEX_DIR, pid);
	system(s);
#endif
#if	SFS_COMPAT
	sprintf(s, "%s/%s", INDEX_DIR, MINI_FILE);
	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	rename(s, s1);
#else
	sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, MINI_FILE, INDEX_DIR, pid);
	system(s);
#endif
#if	SFS_COMPAT
	sprintf(s, "%s/%s", INDEX_DIR, DEF_STAT_FILE);
	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	rename(s, s1);
#else
	sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, DEF_STAT_FILE, INDEX_DIR, pid);
	system(s);
#endif
	/* Don't save messages, log, debug, etc. */
	sprintf(s, "%s/.glimpse_attributes", INDEX_DIR);
	if (!access(s, R_OK)) {
#if	SFS_COMPAT
	    sprintf(s, "%s/%s", INDEX_DIR, ATTRIBUTE_FILE);
	    sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	    rename(s, s1);
#else
	    sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, ATTRIBUTE_FILE, INDEX_DIR, pid);
	    system(s);
#endif
	}
    }

    /* Backup old cast-dictionaries: don't use move since indexing might want to use them */
    sprintf(s, "%s/.glimpse_quick", INDEX_DIR);
    if (!access(s, R_OK)) {	/* there are previous cast dictionaries */
	backup = rand();
	sprintf(s, "%s/.glimpse_backup.%x", INDEX_DIR, backup);
	if (-1 == mkdir(s, 0700)) {
	    fprintf(stderr, "cannot create backup directory %s\n", s);
	    return -1;
	}
	sprintf(s, "exec %s -f %s/.glimpse_quick %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
	system(s);
	sprintf(s, "exec %s -f %s/.glimpse_compress %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
	system(s);
	sprintf(s, "exec %s -f %s/.glimpse_compress.index %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
	system(s);
	sprintf(s, "exec %s -f %s/.glimpse_uncompress %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
	system(s);
	sprintf(s, "exec %s -f %s/.glimpse_uncompress.index %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
	system(s);
	printf("Saved previous cast-dictionary in %s/.glimpse_backup.%x\n", INDEX_DIR, backup);
    }

    /* Now index these files, and build new dictionaries */
    partition(argc, argv);
    initialize_data_structures(file_num);
    old_file_num = file_num;
    build_index();

    cleanup();
    save_data_structures();
    destroy_filename_hashtable();
    uninitialize_common();
    uninitialize_tcompress();
    uninitialize_tuncompress();
    compute_dictionary(threshold, DISKBLOCKSIZE, specialwords, INDEX_DIR);

    if (CompressAfterBuild) {
	/* For the new compression */
	if (!initialize_tcompress(hash_file, freq_file, TC_ERRORMSGS)) goto docleanup;
	printf("Compressing files with new dictionary...\n");
	/* Use the set of file-names collected during partition() / modified during build_hash */
	for(i=0; i<file_num; i++) {
	    if ((disable_list != NULL) && (disable_list[block2index(i)] & mask_int[i%(8*sizeof(int))])) continue;	/* nop since disable_list IS NULL */
	    strcpy(name, LIST_GET(name_list, i));
	    tcompress_file(name, outname, TC_REMOVE | TC_EASYSEARCH | TC_OVERWRITE | TC_NOPROMPT);
	}
    }

docleanup:
    /* Restore old search-dictionaries */
    sprintf(s, "%s/.glimpse_tempdir.%d/.glimpse_index", INDEX_DIR, pid);
    if (!access(s, R_OK)) {
#if	SFS_COMPAT
	sprintf(s1, "%s/%s", INDEX_DIR, INDEX_FILE);
	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, INDEX_FILE);
	rename(s, s1);
	sprintf(s1, "%s/%s", INDEX_DIR, P_TABLE);
	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, P_TABLE);
	rename(s, s1);
	sprintf(s1, "%s/%s", INDEX_DIR, NAME_LIST);
	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_LIST);
	rename(s, s1);
	sprintf(s1, "%s/%s", INDEX_DIR, NAME_LIST_INDEX);
	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_LIST_INDEX);
	rename(s, s1);
	sprintf(s1, "%s/%s", INDEX_DIR, NAME_HASH);
	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_HASH);
	rename(s, s1);
	sprintf(s1, "%s/%s", INDEX_DIR, NAME_HASH_INDEX);
	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_HASH_INDEX);
	rename(s, s1);
	sprintf(s1, "%s/%s", INDEX_DIR, MINI_FILE);
	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, MINI_FILE);
	rename(s, s1);
	sprintf(s1, "%s/%s", INDEX_DIR, DEF_STAT_FILE);
	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, DEF_STAT_FILE);
	rename(s, s1);
	sprintf(s1, "%s/%s", INDEX_DIR, ATTRIBUTE_FILE);
	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, ATTRIBUTE_FILE);
	rename(s, s1);
#else
	sprintf(s, "exec %s -f %s/.glimpse_tempdir.%d/.glimpse_* %s\n", SYSTEM_MV, INDEX_DIR, pid, INDEX_DIR);
	system(s);
#endif
	sprintf(s, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
	rmdir(s);
    }
    printf("\nBuilt new cast-dictionary in %s\n", INDEX_DIR);

#else	/*BUILDCAST*/

    if (AddToIndex || DeleteFromIndex || FastIndex) {
	/* Not handling byte level indices here for now */
	int	indextype;

	sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);
	if (-1 == stat(s, &istbuf)) {
	    if (AddToIndex || DeleteFromIndex) {
		fprintf(stderr, "Cannot find previous index! Fresh indexing recommended\n");
		return usage(0);
	    }
	    /* -f alone with no previous index: quietly fall back to fresh indexing */
	    file_num = 0;
	    file_id = 0;
	    part_num = 1;
	    goto fresh_indexing;
	}

	/* Find out existing index of words and partitions/filenumbers */
	if ((indextype = get_index_type(s)) < 0) {
#if	0
	    fprintf(stderr, "Fresh indexing recommended: -a and -f are not supported with -b as yet\n");
	    exit(1);
	    /* we support it now */
#endif
	}
	file_num = part_num = 0;
	sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST);
	file_num = get_array_of_lines(s, name_list, MaxNum24bPartition, 1);
	initialize_disable_list(file_num);
	initialize_data_structures(file_num);

	if (!indextype) {
		sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);
		part_num = get_table(s, p_table, MAX_PARTITION, 1) - 1;	/* part_num INCLUDES last partition */
	}
	else merge_splits();

	/* Check for errors, Set OneFilePerBlock */
	if ( (file_num <= 0) || (!indextype && (part_num <= 0)) ) {
	    if (AddToIndex || DeleteFromIndex) {
		fprintf(stderr, "Cannot find previous glimpseindex files! Fresh indexing recommended\n");
		return usage(0);
	    }
	    file_num = 0;
	    file_id = 0;
	    part_num = 1;
	    my_free(disable_list);
	    disable_list = NULL;
	    goto fresh_indexing;
	}
	if (OneFilePerBlock && !indextype) {
	    fprintf(stderr, "Warning: ignoring option -o: using format of existing index\n");
	}
	OneFilePerBlock = abs(indextype);
	if (indextype < 0) ByteLevelIndex = ON;

	/* Used in FastIndex for all existing files, used in AddToIndex/DeleteFromIndex if we are trying to add/remove an existing file */
	build_filename_hashtable(name_list, file_num);

#if	0
	/* Test if these are inverses of each other */
	save_data_structures();
	merge_splits();
#endif	/*0*/

	/*
	 * FastIndex: set disable-flag for unchanged files: remove AND
	 * disable non-existent files. Let hole remain in file-names/partitions.
	 */
	if (FastIndex) {
	    for (i=0; i<file_num; i++)
		if (-1 == stat(LIST_GET(name_list, i), &stbuf)) {
			remove_filename(i, -1);
		}
		else if (((stbuf.st_mode & S_IFMT) == S_IFREG) && (stbuf.st_ctime <= istbuf.st_ctime)) {
		    /* This is just used as a cache since exclude/include processing is not done here: see dir.c */
		    disable_list[block2index(i)] |= mask_int[i % (8*sizeof(int))];
		}
		else {
		    /* Can't do it for directories since files in it can be modified w/o date reflected in the directory. Same for symlinks. */
		    LIST_ADD(size_list, i, stbuf.st_size, int);
		    disable_list[block2index(i)] &= ~(mask_int[i % (8*sizeof(int))]);
		}
	}
	/*
	 * AddToIndex without FastIndex: disable all existing files, remove those that don't exist now.
	 * Out of old ones, only ADDED FILES are re-enabled: dir.c
	 */
	else if (AddToIndex) {
	    for (i=0; i<file_num; i++) {
		if (-1 == stat(LIST_GET(name_list, i), &stbuf)) {
		    remove_filename(i, -1);
		}
		else {
		    LIST_ADD(size_list, i, stbuf.st_size, int);	/* ONLY for proper statistics in save_data_structures() */
		    disable_list[block2index(i)] |= mask_int[i % (8*sizeof(int))];
		}
	    }
	}
	/* else: DeleteFromIndex without FastIndex: don't touch other files */

	old_file_num = file_num;
	destroy_data_structures();

	/* Put old/new files into partitions/filenumbers */
	if (-1 == oldpartition(argc, argv)) {
	    /* oldpartition() failed: drop everything read from the old index and start over */
	    for(i=0;i<file_num;i++) {
#if	BG_DEBUG
		memory_usage -= (strlen(LIST_GET(name_list, i)) + 2);
#endif	/*BG_DEBUG*/
		if (LIST_GET(name_list, i) != NULL) {
			my_free(LIST_GET(name_list, i), 0);
			LIST_SUREGET(name_list, i) = NULL;
		}
	    }
	    file_num = 0;
	    file_id = 0;
	    for (i=0;i<part_num; i++) {
		p_table[i] = 0;
	    }
	    part_num = 1;
	    my_free(disable_list);
	    disable_list = NULL;
	    goto fresh_indexing;
	}

	/* Reindex all the files but use the file-names obtained with oldpartition() */
	if (cross_boundary(OneFilePerBlock, file_num)) {
	    my_free(disable_list);
	    disable_list = NULL;
	}

	initialize_data_structures(file_num);
	if (!DeleteFromIndex || FastIndex) build_index();
	if ((deletedlist = get_removed_indices()) == NULL) new_file_num = file_num;
	else if (PurgeIndex) new_file_num = purge_index();

#if	BG_DEBUG
	fprintf(LOGFILE, "Built indices in %s/%s\n", INDEX_DIR, INDEX_FILE);
#endif	/*BG_DEBUG*/
	goto docleanup;
    }

fresh_indexing:
    /* remove it to create space since it can be large: don't need for fresh indexing */
    sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);
    unlink(s);
    /* These should be zeroed since they can confuse fsize and fsize_directory() */
    AddToIndex = 0;
    FastIndex = 0;
#if	BG_DEBUG
    fprintf(LOGFILE, "Commencing fresh indexing\n");
#endif	/*BG_DEBUG*/
    partition(argc, argv);
    destroy_filename_hashtable();
    initialize_data_structures(file_num);
    old_file_num = file_num;
    build_index();
#if	BG_DEBUG
    fprintf(LOGFILE, "\nBuilt indices in %s/%s\n", INDEX_DIR, INDEX_FILE);
#endif	/*BG_DEBUG*/

docleanup:
    cleanup();
    save_data_structures();
    destroy_filename_hashtable();
#if	BG_DEBUG
    fflush(LOGFILE);
    fclose(LOGFILE);
#endif	/*BG_DEBUG*/
    fflush(MESSAGEFILE);
    fclose(MESSAGEFILE);
    fflush(STATFILE);
    fclose(STATFILE);
    if (AddedMaxWordsMessage) printf("\nSome files contributed > %d words to the index: check %s\n", MAXWORDSPERFILE, DEF_MESSAGE_FILE);
    if (AddedMixedWordsMessage) printf("Some files had numerals in > %d%% of the indexed words: check %s\n", NUMERICWORDPERCENT, DEF_MESSAGE_FILE);

    /* List the created files by running ls into a scratch file and echoing it */
    printf("\nIndex-directory: \"%s\"\nGlimpse-files created here:\n", INDEX_DIR);
    chdir(INDEX_DIR);
    sprintf(s, "exec %s -lg .glimpse_* > /tmp/%d\n", SYSTEM_LS, pid);
    system(s);
    sprintf(s, "/tmp/%d", pid);
    if ((tmpfp = fopen(s, "r")) != NULL) {
	memset(tmpbuf, '\0', 1024);
	while(fgets(tmpbuf, 1024, tmpfp) != NULL) fputs(tmpbuf, stdout);
	/* no fflush() here: flushing an input stream is undefined in ISO C */
	fclose(tmpfp);
	unlink(s);
    }
    else fprintf(stderr, "cannot open %s to `cat': check %s for .glimpse - files\n", s, INDEX_DIR);
#endif	/*BUILDCAST*/

    return 0;
}

/*
 * Remove INDEX_DIR/<fname>. The path is built with a bounded snprintf();
 * if it would not fit in the buffer nothing is removed, so a truncated
 * (and therefore wrong) path is never passed to unlink().
 */
static void unlink_index_file(const char *fname)
{
    char path[MAX_LINE_LEN];
    int len;

    len = snprintf(path, sizeof(path), "%s/%s", INDEX_DIR, fname);
    if ((len < 0) || ((size_t)len >= sizeof(path))) return;
    unlink(path);
}

/*
 * Delete the intermediate files I1..I3 and O1..O3 from INDEX_DIR, plus
 * this process's .glimpse_apply.<pid> file. unlink() failures are ignored
 * on purpose, as before: the files may simply not exist.
 * Always returns 0; callers in this file ignore the value.
 */
int cleanup()
{
    char s[MAX_LINE_LEN];
    int len;

    unlink_index_file(I1);
    unlink_index_file(I2);
    unlink_index_file(I3);
    unlink_index_file(O1);
    unlink_index_file(O2);
    unlink_index_file(O3);

    len = snprintf(s, sizeof(s), "%s/.glimpse_apply.%d", INDEX_DIR, getpid());
    if ((len >= 0) && ((size_t)len < sizeof(s))) unlink(s);
    return 0;
}

#if	!BUILDCAST
/*
 * Print the glimpseindex usage summary on stderr and terminate the
 * process with exit status 1. When flag is non-zero the version banner
 * is printed first. Never returns to the caller.
 */
usage(flag)
int	flag;
{
	if (flag) {
		fprintf(stderr, "\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
	}

	fprintf(stderr, "usage: %s [-help] [-a] [-d] [-f] [-i] [-n [#]] [-o] [-s] [-w #] [-B] [-F] [-H dir] [-I] [-M] [-S lim] [-T] [-V] dirs/files\n", IProgname);

	/* The fixed help text takes no arguments, so it is emitted with a single call */
	/* "-s: build the index for structured queries (a1=v1 &/| a2=v2...)" is deliberately not advertised */
	fprintf(stderr,
		"summary of frequently used options\n(for a more detailed listing see 'man glimpse'):\n"
		"-help: outputs this menu\n"
		"-a: add given files/dirs to an existing index\n"
		"-d: delete given files/dirs from an existing index\n"
		"-b: build a (large) byte level index to speed up search\n"
		"-f: use modification dates to do fast indexing\n"
		"-n #: index numbers; warn if file adds > #%% numeric words: default is 50\n"
		"-o: optimize for speed by building a larger index\n"
		"-w #: warn if a file adds > # words to the index\n"
		"-F: expect filenames on stdin (useful for pipelining)\n"
		"-H 'dir': .glimpse-files should be in directory 'dir': default is '~'\n"
		"-T: build .glimpse_turbo for very fast search with -i -w in glimpse\n"
		"\n");

	fprintf(stderr, "For questions about glimpse, please contact `%s'\n", GLIMPSE_EMAIL);
	exit(1);
}
#else	/*!BUILDCAST*/
/*
 * Print the buildcast usage summary on stderr and terminate the process
 * with exit status 1. When flag is non-zero the version banner is
 * printed first. Never returns to the caller.
 */
usage(flag)
int	flag;
{
	if (flag) {
		fprintf(stderr, "\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
	}

	fprintf(stderr, "usage: %s [-help] [-t] [-i] [-l] [-n [#]] [-w #] [-C] [-E] [-F] [-H dir] [-V] dirs/files\n", IProgname);

	/* The fixed help text takes no arguments, so it is emitted with a single call */
	fprintf(stderr,
		"summary of frequently used options\n(for a more detailed listing see 'man cast'):\n"
		"-help: output this menu\n"
		"-n #: index numbers; warn if file adds > #%% numeric words: default is 50\n"
		"-w #: warn if a file adds > # words to the index\n"
		"-C: compress files with the new dictionary after building it\n"
		"-E: build cast dictionary using existing compressed files only\n"
		"-F: expect filenames on stdin (useful for pipelining)\n"
		"-H 'dir': .glimpse-files should be in directory 'dir': default is '~'\n"
		"\n");

	fprintf(stderr, "For questions about glimpse, please contact `%s'\n", GLIMPSE_EMAIL);
	exit(1);
}
#endif	/*!BUILDCAST*/
