/*
 *  Copyright (C) 2010 Lawrence Livermore National Security, LLC.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 *  UCRL-CODE-235197
 *
 *  This file is part of the SPL, Solaris Porting Layer.
 *  For details, see <http://zfsonlinux.org/>.
 *
 *  The SPL is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; either version 2 of the License, or (at your
 *  option) any later version.
 *
 *  The SPL is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 *
 *
 *  Solaris Porting Layer (SPL) Thread Specific Data Implementation.
 *
 *  Thread specific data has implemented using a hash table, this avoids
 *  the need to add a member to the task structure and allows maximum
 *  portability between kernels.  This implementation has been optimized
 *  to keep the tsd_set() and tsd_get() times as small as possible.
 *
 *  The majority of the entries in the hash table are for specific tsd
 *  entries.  These entries are hashed by the product of their key and
 *  pid because by design the key and pid are guaranteed to be unique.
 *  Their product also has the desirable properly that it will be uniformly
 *  distributed over the hash bins providing neither the pid nor key is zero.
 *  Under linux the zero pid is always the init process and thus won't be
 *  used, and this implementation is careful to never to assign a zero key.
 *  By default the hash table is sized to 512 bins which is expected to
 *  be sufficient for light to moderate usage of thread specific data.
 *
 *  The hash table contains two additional type of entries.  They first
 *  type is entry is called a 'key' entry and it is added to the hash during
 *  tsd_create().  It is used to store the address of the destructor function
 *  and it is used as an anchor point.  All tsd entries which use the same
 *  key will be linked to this entry.  This is used during tsd_destory() to
 *  quickly call the destructor function for all tsd associated with the key.
 *  The 'key' entry may be looked up with tsd_hash_search() by passing the
 *  key you wish to lookup and DTOR_PID constant as the pid.
 *
 *  The second type of entry is called a 'pid' entry and it is added to the
 *  hash the first time a process set a key.  The 'pid' entry is also used
 *  as an anchor and all tsd for the process will be linked to it.  This
 *  list is using during tsd_exit() to ensure all registered destructors
 *  are run for the process.  The 'pid' entry may be looked up with
 *  tsd_hash_search() by passing the PID_KEY constant as the key, and
 *  the process pid.  Note that tsd_exit() is called by thread_exit()
 *  so if your using the Solaris thread API you should not need to call
 *  tsd_exit() directly.
 *
 */

#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/tsd.h>
#include <linux/hash.h>

typedef struct tsd_hash_bin {
	spinlock_t		hb_lock;
	struct hlist_head	hb_head;
} tsd_hash_bin_t;

typedef struct tsd_hash_table {
	spinlock_t		ht_lock;
	uint_t			ht_bits;
	uint_t			ht_key;
	tsd_hash_bin_t		*ht_bins;
} tsd_hash_table_t;

typedef struct tsd_hash_entry {
	uint_t			he_key;
	pid_t			he_pid;
	dtor_func_t		he_dtor;
	void			*he_value;
	struct hlist_node	he_list;
	struct list_head	he_key_list;
	struct list_head	he_pid_list;
} tsd_hash_entry_t;

static tsd_hash_table_t *tsd_hash_table = NULL;


/*
 * tsd_hash_search - searches hash table for tsd_hash_entry
 * @table: hash table
 * @key: search key
 * @pid: search pid
 */
static tsd_hash_entry_t *
tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid)
{
	struct hlist_node *node;
	tsd_hash_entry_t *entry;
	tsd_hash_bin_t *bin;
	ulong_t hash;

	hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
	bin = &table->ht_bins[hash];
	spin_lock(&bin->hb_lock);
	hlist_for_each(node, &bin->hb_head) {
		entry = list_entry(node, tsd_hash_entry_t, he_list);
		if ((entry->he_key == key) && (entry->he_pid == pid)) {
			spin_unlock(&bin->hb_lock);
			return (entry);
		}
	}

	spin_unlock(&bin->hb_lock);
	return (NULL);
}

/*
 * tsd_hash_dtor - call the destructor and free all entries on the list
 * @work: list of hash entries
 *
 * For a list of entries which have all already been removed from the
 * hash call their registered destructor then free the associated memory.
 */
static void
tsd_hash_dtor(struct hlist_head *work)
{
	tsd_hash_entry_t *entry;

	while (!hlist_empty(work)) {
		entry = hlist_entry(work->first, tsd_hash_entry_t, he_list);
		hlist_del(&entry->he_list);

		if (entry->he_dtor && entry->he_pid != DTOR_PID)
			entry->he_dtor(entry->he_value);

		kmem_free(entry, sizeof (tsd_hash_entry_t));
	}
}

/*
 * tsd_hash_add - adds an entry to hash table
 * @table: hash table
 * @key: search key
 * @pid: search pid
 *
 * The caller is responsible for ensuring the unique key/pid do not
 * already exist in the hash table.  This possible because all entries
 * are thread specific thus a concurrent thread will never attempt to
 * add this key/pid.  Because multiple bins must be checked to add
 * links to the dtor and pid entries the entire table is locked.
 */
static int
tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
{
	tsd_hash_entry_t *entry, *dtor_entry, *pid_entry;
	tsd_hash_bin_t *bin;
	ulong_t hash;
	int rc = 0;

	ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);

	/* New entry allocate structure, set value, and add to hash */
	entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
	if (entry == NULL)
		return (ENOMEM);

	entry->he_key = key;
	entry->he_pid = pid;
	entry->he_value = value;
	INIT_HLIST_NODE(&entry->he_list);
	INIT_LIST_HEAD(&entry->he_key_list);
	INIT_LIST_HEAD(&entry->he_pid_list);

	spin_lock(&table->ht_lock);

	/* Destructor entry must exist for all valid keys */
	dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID);
	ASSERT3P(dtor_entry, !=, NULL);
	entry->he_dtor = dtor_entry->he_dtor;

	/* Process entry must exist for all valid processes */
	pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid);
	ASSERT3P(pid_entry, !=, NULL);

	hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
	bin = &table->ht_bins[hash];
	spin_lock(&bin->hb_lock);

	/* Add to the hash, key, and pid lists */
	hlist_add_head(&entry->he_list, &bin->hb_head);
	list_add(&entry->he_key_list, &dtor_entry->he_key_list);
	list_add(&entry->he_pid_list, &pid_entry->he_pid_list);

	spin_unlock(&bin->hb_lock);
	spin_unlock(&table->ht_lock);

	return (rc);
}

/*
 * tsd_hash_add_key - adds a destructor entry to the hash table
 * @table: hash table
 * @keyp: search key
 * @dtor: key destructor
 *
 * For every unique key there is a single entry in the hash which is used
 * as anchor.  All other thread specific entries for this key are linked
 * to this anchor via the 'he_key_list' list head.  On return they keyp
 * will be set to the next available key for the hash table.
 */
static int
tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor)
{
	tsd_hash_entry_t *tmp_entry, *entry;
	tsd_hash_bin_t *bin;
	ulong_t hash;
	int keys_checked = 0;

	ASSERT3P(table, !=, NULL);

	/* Allocate entry to be used as a destructor for this key */
	entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
	if (entry == NULL)
		return (ENOMEM);

	/* Determine next available key value */
	spin_lock(&table->ht_lock);
	do {
		/* Limited to TSD_KEYS_MAX concurrent unique keys */
		if (table->ht_key++ > TSD_KEYS_MAX)
			table->ht_key = 1;

		/* Ensure failure when all TSD_KEYS_MAX keys are in use */
		if (keys_checked++ >= TSD_KEYS_MAX) {
			spin_unlock(&table->ht_lock);
			return (ENOENT);
		}

		tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID);
	} while (tmp_entry);

	/* Add destructor entry in to hash table */
	entry->he_key = *keyp = table->ht_key;
	entry->he_pid = DTOR_PID;
	entry->he_dtor = dtor;
	entry->he_value = NULL;
	INIT_HLIST_NODE(&entry->he_list);
	INIT_LIST_HEAD(&entry->he_key_list);
	INIT_LIST_HEAD(&entry->he_pid_list);

	hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits);
	bin = &table->ht_bins[hash];
	spin_lock(&bin->hb_lock);

	hlist_add_head(&entry->he_list, &bin->hb_head);

	spin_unlock(&bin->hb_lock);
	spin_unlock(&table->ht_lock);

	return (0);
}

/*
 * tsd_hash_add_pid - adds a process entry to the hash table
 * @table: hash table
 * @pid: search pid
 *
 * For every process these is a single entry in the hash which is used
 * as anchor.  All other thread specific entries for this process are
 * linked to this anchor via the 'he_pid_list' list head.
 */
static int
tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid)
{
	tsd_hash_entry_t *entry;
	tsd_hash_bin_t *bin;
	ulong_t hash;

	/* Allocate entry to be used as the process reference */
	entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
	if (entry == NULL)
		return (ENOMEM);

	spin_lock(&table->ht_lock);
	entry->he_key = PID_KEY;
	entry->he_pid = pid;
	entry->he_dtor = NULL;
	entry->he_value = NULL;
	INIT_HLIST_NODE(&entry->he_list);
	INIT_LIST_HEAD(&entry->he_key_list);
	INIT_LIST_HEAD(&entry->he_pid_list);

	hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits);
	bin = &table->ht_bins[hash];
	spin_lock(&bin->hb_lock);

	hlist_add_head(&entry->he_list, &bin->hb_head);

	spin_unlock(&bin->hb_lock);
	spin_unlock(&table->ht_lock);

	return (0);
}

/*
 * tsd_hash_del - delete an entry from hash table, key, and pid lists
 * @table: hash table
 * @key: search key
 * @pid: search pid
 */
static void
tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry)
{
	hlist_del(&entry->he_list);
	list_del_init(&entry->he_key_list);
	list_del_init(&entry->he_pid_list);
}

/*
 * tsd_hash_table_init - allocate a hash table
 * @bits: hash table size
 *
 * A hash table with 2^bits bins will be created, it may not be resized
 * after the fact and must be free'd with tsd_hash_table_fini().
 */
static tsd_hash_table_t *
tsd_hash_table_init(uint_t bits)
{
	tsd_hash_table_t *table;
	int hash, size = (1 << bits);

	table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP);
	if (table == NULL)
		return (NULL);

	table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP);
	if (table->ht_bins == NULL) {
		kmem_free(table, sizeof (tsd_hash_table_t));
		return (NULL);
	}

	for (hash = 0; hash < size; hash++) {
		spin_lock_init(&table->ht_bins[hash].hb_lock);
		INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head);
	}

	spin_lock_init(&table->ht_lock);
	table->ht_bits = bits;
	table->ht_key = 1;

	return (table);
}

/*
 * tsd_hash_table_fini - free a hash table
 * @table: hash table
 *
 * Free a hash table allocated by tsd_hash_table_init().  If the hash
 * table is not empty this function will call the proper destructor for
 * all remaining entries before freeing the memory used by those entries.
 */
static void
tsd_hash_table_fini(tsd_hash_table_t *table)
{
	HLIST_HEAD(work);
	tsd_hash_bin_t *bin;
	tsd_hash_entry_t *entry;
	int size, i;

	ASSERT3P(table, !=, NULL);
	spin_lock(&table->ht_lock);
	for (i = 0, size = (1 << table->ht_bits); i < size; i++) {
		bin = &table->ht_bins[i];
		spin_lock(&bin->hb_lock);
		while (!hlist_empty(&bin->hb_head)) {
			entry = hlist_entry(bin->hb_head.first,
			    tsd_hash_entry_t, he_list);
			tsd_hash_del(table, entry);
			hlist_add_head(&entry->he_list, &work);
		}
		spin_unlock(&bin->hb_lock);
	}
	spin_unlock(&table->ht_lock);

	tsd_hash_dtor(&work);
	kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits));
	kmem_free(table, sizeof (tsd_hash_table_t));
}

/*
 * tsd_remove_entry - remove a tsd entry for this thread
 * @entry: entry to remove
 *
 * Remove the thread specific data @entry for this thread.
 * If this is the last entry for this thread, also remove the PID entry.
 */
static void
tsd_remove_entry(tsd_hash_entry_t *entry)
{
	HLIST_HEAD(work);
	tsd_hash_table_t *table;
	tsd_hash_entry_t *pid_entry;
	tsd_hash_bin_t *pid_entry_bin, *entry_bin;
	ulong_t hash;

	table = tsd_hash_table;
	ASSERT3P(table, !=, NULL);
	ASSERT3P(entry, !=, NULL);

	spin_lock(&table->ht_lock);

	hash = hash_long((ulong_t)entry->he_key *
	    (ulong_t)entry->he_pid, table->ht_bits);
	entry_bin = &table->ht_bins[hash];

	/* save the possible pid_entry */
	pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t,
	    he_pid_list);

	/* remove entry */
	spin_lock(&entry_bin->hb_lock);
	tsd_hash_del(table, entry);
	hlist_add_head(&entry->he_list, &work);
	spin_unlock(&entry_bin->hb_lock);

	/* if pid_entry is indeed pid_entry, then remove it if it's empty */
	if (pid_entry->he_key == PID_KEY &&
	    list_empty(&pid_entry->he_pid_list)) {
		hash = hash_long((ulong_t)pid_entry->he_key *
		    (ulong_t)pid_entry->he_pid, table->ht_bits);
		pid_entry_bin = &table->ht_bins[hash];

		spin_lock(&pid_entry_bin->hb_lock);
		tsd_hash_del(table, pid_entry);
		hlist_add_head(&pid_entry->he_list, &work);
		spin_unlock(&pid_entry_bin->hb_lock);
	}

	spin_unlock(&table->ht_lock);

	tsd_hash_dtor(&work);
}

/*
 * tsd_set - set thread specific data
 * @key: lookup key
 * @value: value to set
 *
 * Caller must prevent racing tsd_create() or tsd_destroy(), protected
 * from racing tsd_get() or tsd_set() because it is thread specific.
 * This function has been optimized to be fast for the update case.
 * When setting the tsd initially it will be slower due to additional
 * required locking and potential memory allocations.
 */
int
tsd_set(uint_t key, void *value)
{
	tsd_hash_table_t *table;
	tsd_hash_entry_t *entry;
	pid_t pid;
	int rc;
	/* mark remove if value is NULL */
	boolean_t remove = (value == NULL);

	table = tsd_hash_table;
	pid = curthread->pid;
	ASSERT3P(table, !=, NULL);

	if ((key == 0) || (key > TSD_KEYS_MAX))
		return (EINVAL);

	/* Entry already exists in hash table update value */
	entry = tsd_hash_search(table, key, pid);
	if (entry) {
		entry->he_value = value;
		/* remove the entry */
		if (remove)
			tsd_remove_entry(entry);
		return (0);
	}

	/* don't create entry if value is NULL */
	if (remove)
		return (0);

	/* Add a process entry to the hash if not yet exists */
	entry = tsd_hash_search(table, PID_KEY, pid);
	if (entry == NULL) {
		rc = tsd_hash_add_pid(table, pid);
		if (rc)
			return (rc);
	}

	rc = tsd_hash_add(table, key, pid, value);
	return (rc);
}
EXPORT_SYMBOL(tsd_set);

/*
 * tsd_get - get thread specific data
 * @key: lookup key
 *
 * Caller must prevent racing tsd_create() or tsd_destroy().  This
 * implementation is designed to be fast and scalable, it does not
 * lock the entire table only a single hash bin.
 */
void *
tsd_get(uint_t key)
{
	tsd_hash_entry_t *entry;

	ASSERT3P(tsd_hash_table, !=, NULL);

	if ((key == 0) || (key > TSD_KEYS_MAX))
		return (NULL);

	entry = tsd_hash_search(tsd_hash_table, key, curthread->pid);
	if (entry == NULL)
		return (NULL);

	return (entry->he_value);
}
EXPORT_SYMBOL(tsd_get);

/*
 * tsd_get_by_thread - get thread specific data for specified thread
 * @key: lookup key
 * @thread: thread to lookup
 *
 * Caller must prevent racing tsd_create() or tsd_destroy().  This
 * implementation is designed to be fast and scalable, it does not
 * lock the entire table only a single hash bin.
 */
void *
tsd_get_by_thread(uint_t key, kthread_t *thread)
{
	tsd_hash_entry_t *entry;

	ASSERT3P(tsd_hash_table, !=, NULL);

	if ((key == 0) || (key > TSD_KEYS_MAX))
		return (NULL);

	entry = tsd_hash_search(tsd_hash_table, key, thread->pid);
	if (entry == NULL)
		return (NULL);

	return (entry->he_value);
}
EXPORT_SYMBOL(tsd_get_by_thread);

/*
 * tsd_create - create thread specific data key
 * @keyp: lookup key address
 * @dtor: destructor called during tsd_destroy() or tsd_exit()
 *
 * Provided key must be set to 0 or it assumed to be already in use.
 * The dtor is allowed to be NULL in which case no additional cleanup
 * for the data is performed during tsd_destroy() or tsd_exit().
 *
 * Caller must prevent racing tsd_set() or tsd_get(), this function is
 * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
 */
void
tsd_create(uint_t *keyp, dtor_func_t dtor)
{
	ASSERT3P(keyp, !=, NULL);
	if (*keyp)
		return;

	(void) tsd_hash_add_key(tsd_hash_table, keyp, dtor);
}
EXPORT_SYMBOL(tsd_create);

/*
 * tsd_destroy - destroy thread specific data
 * @keyp: lookup key address
 *
 * Destroys the thread specific data on all threads which use this key.
 *
 * Caller must prevent racing tsd_set() or tsd_get(), this function is
 * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
 */
void
tsd_destroy(uint_t *keyp)
{
	HLIST_HEAD(work);
	tsd_hash_table_t *table;
	tsd_hash_entry_t *dtor_entry, *entry;
	tsd_hash_bin_t *dtor_entry_bin, *entry_bin;
	ulong_t hash;

	table = tsd_hash_table;
	ASSERT3P(table, !=, NULL);

	spin_lock(&table->ht_lock);
	dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID);
	if (dtor_entry == NULL) {
		spin_unlock(&table->ht_lock);
		return;
	}

	/*
	 * All threads which use this key must be linked off of the
	 * DTOR_PID entry.  They are removed from the hash table and
	 * linked in to a private working list to be destroyed.
	 */
	while (!list_empty(&dtor_entry->he_key_list)) {
		entry = list_entry(dtor_entry->he_key_list.next,
		    tsd_hash_entry_t, he_key_list);
		ASSERT3U(dtor_entry->he_key, ==, entry->he_key);
		ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor);

		hash = hash_long((ulong_t)entry->he_key *
		    (ulong_t)entry->he_pid, table->ht_bits);
		entry_bin = &table->ht_bins[hash];

		spin_lock(&entry_bin->hb_lock);
		tsd_hash_del(table, entry);
		hlist_add_head(&entry->he_list, &work);
		spin_unlock(&entry_bin->hb_lock);
	}

	hash = hash_long((ulong_t)dtor_entry->he_key *
	    (ulong_t)dtor_entry->he_pid, table->ht_bits);
	dtor_entry_bin = &table->ht_bins[hash];

	spin_lock(&dtor_entry_bin->hb_lock);
	tsd_hash_del(table, dtor_entry);
	hlist_add_head(&dtor_entry->he_list, &work);
	spin_unlock(&dtor_entry_bin->hb_lock);
	spin_unlock(&table->ht_lock);

	tsd_hash_dtor(&work);
	*keyp = 0;
}
EXPORT_SYMBOL(tsd_destroy);

/*
 * tsd_exit - destroys all thread specific data for this thread
 *
 * Destroys all the thread specific data for this thread.
 *
 * Caller must prevent racing tsd_set() or tsd_get(), this function is
 * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
 */
void
tsd_exit(void)
{
	HLIST_HEAD(work);
	tsd_hash_table_t *table;
	tsd_hash_entry_t *pid_entry, *entry;
	tsd_hash_bin_t *pid_entry_bin, *entry_bin;
	ulong_t hash;

	table = tsd_hash_table;
	ASSERT3P(table, !=, NULL);

	spin_lock(&table->ht_lock);
	pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid);
	if (pid_entry == NULL) {
		spin_unlock(&table->ht_lock);
		return;
	}

	/*
	 * All keys associated with this pid must be linked off of the
	 * PID_KEY entry.  They are removed from the hash table and
	 * linked in to a private working list to be destroyed.
	 */

	while (!list_empty(&pid_entry->he_pid_list)) {
		entry = list_entry(pid_entry->he_pid_list.next,
		    tsd_hash_entry_t, he_pid_list);
		ASSERT3U(pid_entry->he_pid, ==, entry->he_pid);

		hash = hash_long((ulong_t)entry->he_key *
		    (ulong_t)entry->he_pid, table->ht_bits);
		entry_bin = &table->ht_bins[hash];

		spin_lock(&entry_bin->hb_lock);
		tsd_hash_del(table, entry);
		hlist_add_head(&entry->he_list, &work);
		spin_unlock(&entry_bin->hb_lock);
	}

	hash = hash_long((ulong_t)pid_entry->he_key *
	    (ulong_t)pid_entry->he_pid, table->ht_bits);
	pid_entry_bin = &table->ht_bins[hash];

	spin_lock(&pid_entry_bin->hb_lock);
	tsd_hash_del(table, pid_entry);
	hlist_add_head(&pid_entry->he_list, &work);
	spin_unlock(&pid_entry_bin->hb_lock);
	spin_unlock(&table->ht_lock);

	tsd_hash_dtor(&work);
}
EXPORT_SYMBOL(tsd_exit);

int
spl_tsd_init(void)
{
	tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT);
	if (tsd_hash_table == NULL)
		return (1);

	return (0);
}

void
spl_tsd_fini(void)
{
	tsd_hash_table_fini(tsd_hash_table);
	tsd_hash_table = NULL;
}