cdrtools/mkisofs/hash.c

/* @(#)hash.c	1.30 16/12/13 joerg */
#include <schily/mconfig.h>
#ifndef lint
static	UConst char sccsid[] =
	"@(#)hash.c	1.30 16/12/13 joerg";

#endif
/*
 * File hash.c - generate hash tables for iso9660 filesystem.
 *
 * Written by Eric Youngdale (1993).
 *
 * Copyright 1993 Yggdrasil Computing, Incorporated
 * Copyright (c) 1999,2000-2016 J. Schilling
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* APPLE_HYB James Pearson j.pearson@ge.ucl.ac.uk 23/2/2000 */

/*
 * From jb@danware.dk:
 *
 * Cygwin fakes inodes by hashing file info, actual collisions observed!
 * This is documented in the cygwin source, look at winsup/cygwin/path.cc
 * and search for the word 'Hash'.  On NT, cygwin ORs together the
 * high and low 32 bits of the 64 bit genuine inode, look at fhandler.cc.
 *
 * Note:	Other operating systems which support the FAT filesystem may
 *		have the same problem because FAT does not use the inode
 *		concept.  For NTFS, genuine inode numbers exist, but they are
 *		64 bits and available only through an open file handle.
 *
 * The solution is the new options -no-cache-inodes/-cache-inodes that
 * allow to disable the mkisofs inode cache.
 */

/* DUPLICATES_ONCE Alex Kopylov cdrtools@bootcd.ru 19.06.2004 */

#include "mkisofs.h"
#include <schily/schily.h>
#include <schily/sha3.h>

#define	NR_HASH	(16*1024)

#define	HASH_FN(DEV, INO)	((DEV + INO  + (INO >> 8) + (INO << 16)) % NR_HASH)

static struct file_hash *hash_table[NR_HASH];

#ifdef	DUPLICATES_ONCE

#define	DIGEST_FAST_SIZE		65536
#define	UNIQUE_FILES_HASH_FN(SIZE)	((SIZE) % NR_HASH)

#ifndef	DIGEST_Init
#define	DIGEST_Init			SHA3_512_Init
#define	DIGEST_Update			SHA3_Update
#define	DIGEST_Final			SHA3_Final
#define	DIGEST_CTX			SHA3_CTX
#define	DIGEST_LENGTH			SHA3_512_DIGEST_LENGTH

#ifdef	SHORT_DIGEST
#undef	DIGEST_Init
#undef	DIGEST_LENGTH
#define	DIGEST_Init			SHA3_256_Init
#define	DIGEST_LENGTH			SHA3_256_DIGEST_LENGTH
#endif	/* SHORT_DIGEST */
#endif	/* DIGEST_Init */
#endif

#ifdef	HASH_DEBUG
EXPORT	void		debug_hash	__PR((void));
#endif
EXPORT	void		add_hash	__PR((struct directory_entry *spnt));
EXPORT	struct file_hash *find_hash	__PR((struct directory_entry *spnt));
EXPORT	void		flush_hash	__PR((void));
EXPORT	void		add_directory_hash __PR((dev_t dev, ino_t inode));
EXPORT	struct file_hash *find_directory_hash __PR((dev_t dev, ino_t inode));
LOCAL	unsigned int	name_hash	__PR((const char *name));
EXPORT	void		add_file_hash	__PR((struct directory_entry *de));
EXPORT	struct directory_entry *find_file_hash __PR((char *name));
LOCAL	BOOL		isoname_endsok	__PR((char *name));
EXPORT	int		delete_file_hash __PR((struct directory_entry *de));
EXPORT	void		flush_file_hash	__PR((void));
#ifdef	DUPLICATES_ONCE
LOCAL	struct directory_entry *compare_files __PR((
					struct directory_entry *spnt1,
					struct directory_entry *spnt2));
LOCAL	unsigned char	*DIGEST_File	__PR((char *name, size_t size));
#endif

#ifdef	HASH_DEBUG
EXPORT void
debug_hash()
{
	struct file_hash **p = hash_table;
	int	i;
	int	j = 0;
	struct file_hash *p2;
	int	k;
	int	maxlen = 0;
	int	minlen = 100000000;
	int	tot = 0;

	for (i = 0; i < NR_HASH; i++, p++) {
		if (*p == 0)
			j++;
		else {
			p2 = *p;
			k = 0;
			while (p2) {
				tot++;
				k++;
				p2 = p2->next;
			}
			if (k > maxlen)
				maxlen = k;
			if (k < minlen)
				minlen = k;
		}
	}
	error("Unbenutzt: %d von %d Eintr<74>gen maxlen %d minlen %d total %d optmittel %d\n",
		j, NR_HASH, maxlen, minlen, tot, tot/NR_HASH);
}
#endif

EXPORT void
add_hash(spnt)
	struct directory_entry	*spnt;
{
	struct file_hash *s_hash;
	unsigned int    hash_number = 0;

	if (spnt->size == 0 || spnt->starting_block == 0)
		if (spnt->size != 0 && spnt->starting_block == 0) {
			comerrno(EX_BAD,
			_("Non zero-length file '%s' assigned zero extent.\n"),
							spnt->name);
		};

#ifdef	DUPLICATES_ONCE
	if (!cache_inodes && !duplicates_once)
#else
	if (!cache_inodes)
#endif
		return;
	if (spnt->dev == UNCACHED_DEVICE &&
	    (spnt->inode == TABLE_INODE || spnt->inode == UNCACHED_INODE)) {
		return;
	}
#ifdef	DUPLICATES_ONCE
	if (cache_inodes)
#endif
		hash_number = HASH_FN((unsigned int) spnt->dev,
						(unsigned int) spnt->inode);
#ifdef	DUPLICATES_ONCE
	else if (duplicates_once &&
		    spnt->size && !(spnt->isorec.flags[0] & ISO_DIRECTORY))
		hash_number = UNIQUE_FILES_HASH_FN((unsigned int) spnt->size);
#endif

#if 0
	if (verbose > 1)
		fprintf(stderr, "%s ", spnt->name);
#endif
	s_hash = (struct file_hash *)e_malloc(sizeof (struct file_hash));
	s_hash->next = hash_table[hash_number];
	s_hash->inode = spnt->inode;
	s_hash->dev = spnt->dev;
	s_hash->nlink = 0;
	s_hash->starting_block = spnt->starting_block;
	s_hash->size = spnt->size;
#if	defined(SORTING) || defined(DUPLICATES_ONCE)
	s_hash->de = spnt;
#endif /* defined(SORTING) || defined(DUPLICATES_ONCE) */
	hash_table[hash_number] = s_hash;
}

EXPORT struct file_hash *
find_hash(spnt)
	struct directory_entry	*spnt;
{
	unsigned int    hash_number;
	struct file_hash *s_hash;

#ifdef	DUPLICATES_ONCE
	if (!cache_inodes && !duplicates_once)
#else
	if (!cache_inodes)
#endif
		return (NULL);
	if (spnt->dev == UNCACHED_DEVICE &&
	    (spnt->inode == TABLE_INODE || spnt->inode == UNCACHED_INODE))
		return (NULL);

#ifdef	DUPLICATES_ONCE
	if (cache_inodes) {
#endif
		hash_number = HASH_FN((unsigned int) spnt->dev,
				    (unsigned int) spnt->inode);
		s_hash = hash_table[hash_number];
		while (s_hash) {
			if (s_hash->inode == spnt->inode &&
			    s_hash->dev == spnt->dev)
				return (s_hash);
			s_hash = s_hash->next;
		}
#ifdef	DUPLICATES_ONCE
	} else if (duplicates_once &&
		    spnt->size && !(spnt->isorec.flags[0] & ISO_DIRECTORY)) {
		hash_number = UNIQUE_FILES_HASH_FN((unsigned int) spnt->size);
		s_hash = hash_table[hash_number];
		while (s_hash) {
			if (compare_files(spnt, s_hash->de))
				return (s_hash);
			s_hash = s_hash->next;
		}
	}
#endif
	return (NULL);
}

/*
 * based on flush_file_hash() below - needed as we want to re-use the
 * file hash table.
 */
EXPORT void
flush_hash()
{
	struct file_hash	*fh;
	struct file_hash	*fh1;
	int			i;

	for (i = 0; i < NR_HASH; i++) {
		fh = hash_table[i];
		while (fh) {
			fh1 = fh->next;
			free(fh);
			fh = fh1;
		}
		hash_table[i] = NULL;
	}
}

#ifdef	DUPLICATES_ONCE
LOCAL struct directory_entry *
compare_files(spnt1, spnt2)
	struct directory_entry	*spnt1;
	struct directory_entry	*spnt2;
{
	if (spnt1->size != spnt2->size)
		return (NULL);

	if (!spnt1->digest_fast)
		if (!(spnt1->digest_fast = DIGEST_File(spnt1->whole_name,
				(spnt1->size > DIGEST_FAST_SIZE) ?
				DIGEST_FAST_SIZE : spnt1->size)))
			return (NULL);

	if (spnt1->size <= DIGEST_FAST_SIZE)
		spnt1->digest_full = spnt1->digest_fast;

	if (!spnt2->digest_fast)
		if (!(spnt2->digest_fast = DIGEST_File(spnt2->whole_name,
				(spnt2->size > DIGEST_FAST_SIZE) ?
				DIGEST_FAST_SIZE : spnt2->size)))
			return (NULL);

	if (spnt2->size <= DIGEST_FAST_SIZE)
		spnt2->digest_full = spnt2->digest_fast;

	if (memcmp(spnt1->digest_fast, spnt2->digest_fast,
			DIGEST_LENGTH * sizeof (unsigned char)))
		return (NULL);

	if (!spnt1->digest_full)
		if (!(spnt1->digest_full = DIGEST_File(spnt1->whole_name,
						spnt1->size)))
			return (NULL);

	if (!spnt2->digest_full)
		if (!(spnt2->digest_full = DIGEST_File(spnt2->whole_name,
						spnt2->size)))
			return (NULL);

	if (memcmp(spnt1->digest_full, spnt2->digest_full,
			DIGEST_LENGTH * sizeof (unsigned char)))
		return (NULL);

	return (spnt2);
}

LOCAL unsigned char *
DIGEST_File(name, size)
	char		*name;
	size_t		size;
{
	DIGEST_CTX	digest_ctx;
	FILE		*infile;
	unsigned char	buf[32768];
	unsigned char	*digest_hash;
	size_t		cnt;

	DIGEST_Init(&digest_ctx);

	if ((infile = fopen(name, "rb")) == NULL)
		return (NULL);

	while (size) {
		cnt = (size > sizeof (buf)) ? sizeof (buf) : size;
		if ((cnt = fread(buf, 1, cnt, infile)) <= 0) {
			fclose(infile);
			return (NULL);
		}
		DIGEST_Update(&digest_ctx, buf, cnt);
		size -= cnt;
	}

	fclose(infile);

	digest_hash = e_malloc(sizeof (unsigned char) * DIGEST_LENGTH);
	DIGEST_Final(digest_hash, &digest_ctx);
	return (digest_hash);
}
#endif

static struct file_hash *directory_hash_table[NR_HASH];

#ifdef	PROTOTYPES
EXPORT void
add_directory_hash(dev_t dev, ino_t inode)
#else
EXPORT void
add_directory_hash(dev, inode)
	dev_t	dev;
	ino_t	inode;
#endif
{
	struct file_hash *s_hash;
	unsigned int    hash_number;

	if (!cache_inodes)
		return;
	if (dev == UNCACHED_DEVICE &&
	    (inode == TABLE_INODE || inode == UNCACHED_INODE))
		return;

	hash_number = HASH_FN((unsigned int) dev, (unsigned int) inode);

	s_hash = (struct file_hash *)e_malloc(sizeof (struct file_hash));
	s_hash->next = directory_hash_table[hash_number];
	s_hash->inode = inode;
	s_hash->dev = dev;
	s_hash->nlink = 0;
	directory_hash_table[hash_number] = s_hash;
}

#ifdef	PROTOTYPES
EXPORT struct file_hash *
find_directory_hash(dev_t dev, ino_t inode)
#else
EXPORT struct file_hash *
find_directory_hash(dev, inode)
	dev_t	dev;
	ino_t	inode;
#endif
{
	unsigned int    hash_number;
	struct file_hash *spnt;

	if (!cache_inodes)
		return (NULL);
	if (dev == UNCACHED_DEVICE &&
	    (inode == TABLE_INODE || inode == UNCACHED_INODE))
		return (NULL);

	hash_number = HASH_FN((unsigned int) dev, (unsigned int) inode);
	spnt = directory_hash_table[hash_number];
	while (spnt) {
		if (spnt->inode == inode && spnt->dev == dev)
			return (spnt);
		spnt = spnt->next;
	};
	return (NULL);
}

struct name_hash {
	struct name_hash *next;
	struct directory_entry *de;
	int	sum;
};

#define	NR_NAME_HASH	128

static struct name_hash *name_hash_table[NR_NAME_HASH] = {0, };

/*
 * Find the hash bucket for this name.
 */
LOCAL unsigned int
name_hash(name)
	const char	*name;
{
	unsigned int	hash = 0;
	const char	*p;

	p = name;

	while (*p) {
		/*
		 * Don't hash the  iso9660 version number.
		 * This way we can detect duplicates in cases where we have
		 * directories (i.e. foo) and non-directories (i.e. foo;1).
		 */
		if (*p == ';') {
			break;
		}
		hash = (hash << 15) + (hash << 3) + (hash >> 3) + (*p++ & 0xFF);
	}
	return (hash % NR_NAME_HASH);
}

EXPORT void
add_file_hash(de)
	struct directory_entry	*de;
{
	struct name_hash	*new;
	int			hash;
	Uchar			*p;
	int			sum = 0;

	new = (struct name_hash *)e_malloc(sizeof (struct name_hash));
	new->de = de;
	new->next = NULL;
	for (p = (Uchar *)de->isorec.name; *p; p++) {
		if (*p == ';')
			break;
		sum += *p & 0xFF;
	}
	new->sum = sum;
	hash = name_hash(de->isorec.name);

	/* Now insert into the hash table */
	new->next = name_hash_table[hash];
	name_hash_table[hash] = new;
}

EXPORT struct directory_entry *
find_file_hash(name)
	register char			*name;
{
	register char			*p1;
	register char			*p2;
	register struct name_hash	*nh;
	register int			sum = 0;

	if (debug > 1)
		error("find_hash('%s')\n", name);

	for (p1 = name; *p1; p1++) {
		if (*p1 == ';')
			break;
		sum += *p1 & 0xFF;
	}

	for (nh = name_hash_table[name_hash(name)]; nh; nh = nh->next) {
		if (nh->sum != sum)
			continue;

		p1 = name;
		p2 = nh->de->isorec.name;
		if (debug > 1)
			error(_("Checking name '%s' isorec.name '%s'\n"), p1, p2);

		/* Look for end of string, or a mismatch. */
		while (1 == 1) {
			if ((*p1 == '\0' || *p1 == ';') ||
			    (*p2 == '\0' || *p2 == ';') ||
			    (*p1 != *p2)) {
				break;
			}
			p1++;
			p2++;
		}
		if (!isoname_endsok(p1) || !isoname_endsok(p2)) {
			if (debug > 1) {
				if (!isoname_endsok(p1))
					error(_("'%s' does NOT END OK\n"), p1);
				if (!isoname_endsok(p2))
					error(_("'%s' does NOT END OK\n"), p2);
			}
			/*
			 * If one file does not end with a valid version number
			 * and the other name ends here, we found a miss match.
			 */
			if (*p1 == '\0' || *p2 == '\0')
				continue;

			if (*p1 == ';' && *p2 == ';') {
				p1++;
				p2++;
				continue;
			}
		}

		/*
		 * If we are at the end of both strings, then we have a match.
		 */
		if ((*p1 == '\0' || *p1 == ';') &&
		    (*p2 == '\0' || *p2 == ';')) {
			return (nh->de);
		}
	}
	return (NULL);
}

/*
 * The macro 'eo' is just an idea on how one might speed up isoname_endsok()
 */
#define	eo(p)	(((p)[0] == '\0') || \
		((p)[0] == ';' && (p)[1] == '1' && (p)[2] == '\0') || \
		isoname_endsok(p))

LOCAL BOOL
isoname_endsok(name)
	char	*name;
{
	int	i;
	char	*p;

	if (*name == '\0')
		return (TRUE);
	if (*name != ';')
		return (FALSE);

	for (p = ++name, i = 0; *p && i < 5; p++, i++) {
		if (*p < '0' || *p > '9')
			return (FALSE);
	}
	i = atoi(name);
	if (i < 1 || i > 32767)
		return (FALSE);
	return (TRUE);
}

EXPORT int
delete_file_hash(de)
	struct directory_entry	*de;
{
	struct name_hash	*nh;
	struct name_hash	*prev;
	int			hash;

	prev = NULL;
	hash = name_hash(de->isorec.name);
	for (nh = name_hash_table[hash]; nh; nh = nh->next) {
		if (nh->de == de)
			break;
		prev = nh;
	}
	if (!nh)
		return (1);
	if (!prev)
		name_hash_table[hash] = nh->next;
	else
		prev->next = nh->next;
	free(nh);
	return (0);
}

EXPORT void
flush_file_hash()
{
	struct name_hash	*nh;
	struct name_hash	*nh1;
	int			i;

	for (i = 0; i < NR_NAME_HASH; i++) {
		nh = name_hash_table[i];
		while (nh) {
			nh1 = nh->next;
			free(nh);
			nh = nh1;
		}
		name_hash_table[i] = NULL;

	}
}