
/*
 *         PVM version 3.3:  Parallel Virtual Machine System
 *               University of Tennessee, Knoxville TN.
 *           Oak Ridge National Laboratory, Oak Ridge TN.
 *                   Emory University, Atlanta GA.
 *      Authors:  A. L. Beguelin, J. J. Dongarra, G. A. Geist,
 *    W. C. Jiang, R. J. Manchek, B. K. Moore, and V. S. Sunderam
 *                   (C) 1992 All Rights Reserved
 *
 *                              NOTICE
 *
 * Permission to use, copy, modify, and distribute this software and
 * its documentation for any purpose and without fee is hereby granted
 * provided that the above copyright notice appear in all copies and
 * that both the copyright notice and this permission notice appear in
 * supporting documentation.
 *
 * Neither the Institutions (Emory University, Oak Ridge National
 * Laboratory, and University of Tennessee) nor the Authors make any
 * representations about the suitability of this software for any
 * purpose.  This software is provided ``as is'' without express or
 * implied warranty.
 *
 * PVM version 3 was funded in part by the U.S. Department of Energy,
 * the National Science Foundation and the State of Tennessee.
 */

/*
 *	pvmd.c
 *
 *	Mr. pvm daemon.
 *
$Log: pvmd.c,v $
 * Revision 1.41  1996/05/13  21:38:44  manchek
 * added errno checks for ENETDOWN, etc. if sendto fails, so we can
 * tolerate short network outages.
 * change runstate to PVMDISTASK during fork in case we get signalled
 * right then, exit instead of cleaning up in evilsig and catch.
 * reset dead fifo in beprime so pvmd' doesn't try to clean up after SIGCHLDs
 *
 * Revision 1.40  1995/11/02  16:29:24  manchek
 * added -t flag for test mode.
 * put back save under packet header in netoutput.
 * refragment in pkt_to_host now handles message header correctly
 *
 * Revision 1.39  1995/09/05  19:22:07  manchek
 * forgot ifdef for SP2MPI
 *
 * Revision 1.38  1995/07/28  20:52:01  manchek
 * missed changing src to pk_src in loclinpkt
 *
 * Revision 1.37  1995/07/28  16:40:59  manchek
 * wrap HASERRORVARS around errno declarations
 *
 * Revision 1.36  1995/07/24  19:52:02  manchek
 * message header no longer part of packet data, goes in pkt struct.
 * socket drivers in {locl,net}{in,out}put must strip and reconstitute headers.
 * no longer need to replicate first fragment of message to send,
 * or to save-under.
 * cleaned up line between loclinput and loclinpkt.
 *
 * Revision 1.35  1995/07/19  21:26:57  manchek
 * use new function pvmnametag instead of [dts]mname
 *
 * Revision 1.34  1995/07/18  17:02:03  manchek
 * added code to generate and check crc on each message (MCHECKSUM)
 *
 * Revision 1.33  1995/07/11  18:56:00  manchek
 * main prints PVMSOCK instead of master_config (after mpp_init)
 *
 * Revision 1.32  1995/07/05  16:20:39  manchek
 * work calls mpp_dredge for zombies if task with zero tid closes socket
 * (possibly a shared memory task exiting)
 *
 * Revision 1.31  1995/07/03  19:16:24  manchek
 * removed POWER4 ifdefs and misc. schmutz
 *
 * Revision 1.30  1995/06/28  15:27:33  manchek
 * pvmbailout doesn't set global bailing_out
 *
 * Revision 1.29  1995/06/16  16:28:35  manchek
 * (CSPP) CINDEX macro defined both in pvmd.c and system include file.
 * can undef CINDEX before we define it for us
 *
 * Revision 1.28  1995/06/02  17:51:44  manchek
 * added code to balance spawn (forks) on CSPP
 *
 * Revision 1.27  1995/05/30  17:46:59  manchek
 * Added ifdefs for SP2MPI arch
 *
 * Revision 1.26  1995/05/17  16:31:57  manchek
 * changed global mytid to pvmmytid.
 * pvmbailout sets global bailing_out (used by shared memory code).
 * use PVMDDEBUG envar to set debugmask at startup.
 * added new debug classes.
 * use FDSETISINT.
 * on LINUX systems, check sendto for ENOMEM.
 *
 * Revision 1.25  1995/02/06  22:40:11  manchek
 * shared memory ports call mpp_setmtu before slave_config
 *
 * Revision 1.24  1995/02/06  18:52:24  manchek
 * added debugging prints for when main select in work fails (solaris)
 *
 * Revision 1.23  1995/02/06  05:01:28  manchek
 * hmm
 *
 * Revision 1.22  1995/02/03  16:45:27  manchek
 * touch up reap - define rus as int if we don't think struct rusage exists
 *
 * Revision 1.21  1995/02/01  21:31:23  manchek
 * added clear_opq_of, called when host is deleted from table or pvmd' exits
 *
 * Revision 1.20  1994/12/20  16:40:35  manchek
 * use O_NONBLOCK for RS6K
 *
 * Revision 1.19  1994/11/08  19:05:07  manchek
 * mpp fix?
 *
 * Revision 1.18  1994/11/08  15:30:51  manchek
 * shared memory cleanup
 *
 * Revision 1.17  1994/10/15  19:27:02  manchek
 * make wrk_fds_init(), use instead of FD_ZERO.
 * don't send FIN|ACK to ourself in bailout.
 * don't clean up task until SIGCHLD if TF_FORKD set.
 * check newhosts when deleting host.
 * cast message tags for comparison as integers.
 * in beprime() call task_init instead of trying to clean up
 *
 * Revision 1.16  1994/09/02  15:48:17  manchek
 * added UXPM ifdef to parallel SUN4SOL2
 *
 * Revision 1.15  1994/09/02  15:27:55  manchek
 * forgot to inc refcount of nth fragment in sendmessage
 *
 * Revision 1.14  1994/07/18  19:21:51  manchek
 * added PDMWAITC.
 * fix to call write() with max 4096 length for RS6K
 *
 * Revision 1.13  1994/06/30  21:36:56  manchek
 * don't check remote sockaddr in netinput() on LINUX
 *
 * Revision 1.12  1994/06/04  21:45:10  manchek
 * added unix domain sockets
 *
 * Revision 1.11  1994/06/03  20:38:22  manchek
 * version 3.3.0
 *
 * Revision 1.10  1993/12/20  15:39:28  manchek
 * patch 6 from wcj
 *
 * Revision 1.9  1993/10/25  20:51:11  manchek
 * make sure pvmd doesn't use 0..2 for sockets, etc. - open /dev/null.
 * added code to change process group/disassoc. from tty (TTYDIS).
 * ping other pvmds also when run state is PVMDHTUPD
 *
 * Revision 1.8  1993/10/12  14:18:37  manchek
 * fixed bug in locloutput() - hung if write() returned 0
 *
 * Revision 1.7  1993/10/04  20:27:42  manchek
 * renamed useruid to pvm_useruid for compat with libpvm
 *
 * Revision 1.6  1993/10/04  19:17:45  manchek
 * on Solaris, sendto() can return ECHILD.  Hahahahahaha!!!
 *
 * Revision 1.5  1993/10/04  19:12:25  manchek
 * hd_txseq wasn't wrapped properly with NEXTSEQNUM
 *
 * Revision 1.4  1993/09/23  20:36:19  manchek
 * fixed broken mca lookup
 *
 * Revision 1.3  1993/09/22  19:14:47  manchek
 * added network resend statistic.
 * removed redundant code in netinpkt() where it finds mca
 *
 * Revision 1.2  1993/09/16  21:45:32  manchek
 * replaced reap() - now uses SYSVSIGNAL and NOWAIT3 macros
 *
 * Revision 1.1  1993/08/30  23:26:50  manchek
 * Initial revision
 *
 */


#include <sys/param.h>
#ifdef IMA_TITN
#include <bsd/sys/types.h>
#else
#include <sys/types.h>
#include <sys/ioctl.h>
#endif
#include <sys/time.h>
#include <sys/wait.h>
#ifndef	NOWAIT3
#include <sys/resource.h>
#endif
#if defined(IMA_RS6K) || defined(IMA_SP2MPI)
#include <sys/select.h>
#endif
#include <sys/stat.h>
#include <sys/socket.h>
#ifndef NOUNIXDOM
#include <sys/un.h>
#endif
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <signal.h>
#include <ctype.h>
/* Must come before local CINDEX macro definition */
#if defined(IMA_CSPP) && defined(BALANCED_SPAWN)
#include <sys/cnx_types.h>
#include <sys/cnx_sysinfo.h>
#include <sys/cnx_pattr.h>
#undef CINDEX
#endif
#ifdef	SYSVSTR
#include <string.h>
#define	CINDEX(s,c)	strchr(s,c)
#else
#include <strings.h>
#define	CINDEX(s,c)	index(s,c)
#endif
#include <netdb.h>
#include <pwd.h>

#include "global.h"
#include "fromlib.h"
#include "tdpro.h"
#include "ddpro.h"
#include <pvmsdpro.h>
#include "protoglarp.h"
#include "pvmalloc.h"
#include "host.h"
#include "pvmdabuf.h"
#include "pvmfrag.h"
#include "mesg.h"
#include "pkt.h"
#include "task.h"
#include "waitc.h"
#include "listmac.h"
#include "tvdefs.h"
#if defined(IMA_PGON) || defined(IMA_I860) || defined(IMA_CM5) || defined(SHMEM) || defined(IMA_SP2MPI)
#include "pvmdmp.h"
#endif
#include "bfunc.h"

#if defined(IMA_CRAY) || defined(IMA_CRAY2) || defined(IMA_I860) || defined(IMA_E88K) || defined(IMA_SCO)
#ifndef	MAXPATHLEN
#define	MAXPATHLEN	CANBSIZ
#endif
#endif

#ifndef	max
#define	max(a,b)	((a)>(b)?(a):(b))
#endif

#ifndef	min
#define	min(a,b)	((a)<(b)?(a):(b))
#endif

#ifndef	TTYDIS
#define	TTYDIS	0
#endif

#ifdef	STATISTICS
/*
 * Event counters kept only when STATISTICS is defined.  A single global
 * instance (stats) is updated as the daemon runs: work() bumps the
 * sel* fields according to the result of its main select(), and
 * netoutput() bumps sdneg/sdok after each sendto() and netret when a
 * packet that was already sent at least once is sent again.
 */
struct statistics {
	int selneg, selzer, selrdy;		/* neg, zero, ready selects */
	int rdneg, rdzer, rdok;			/* neg, zero, positive reads */
	int wrneg, wrzer, wrshr, wrok;	/* neg, zero, short, ok writes */
	int sdneg, sdok;				/* neg, ok sendtos */
	int rfok;						/* ok recvfroms */
	int refrag;						/* refragmented frags */
	int netret;						/* network resends */
};

struct statistics stats;
#endif

/*
 * One record per reaped child process.  reap() (the SIGCHLD/SIGCLD
 * handler) fills these into the circular array deads[] at index wdead;
 * work() consumes them from index rdead and copies the exit status and
 * times into the matching task before cleaning it up.
 */
struct deaddata {
	int dd_pid;				/* process id */
	int dd_es;				/* unix exit status */
	struct timeval dd_ut;	/* user time used */
	struct timeval dd_st;	/* system time used */
};

void catch();
char *debug_flags();
char *pvmnametag();
void biteme();
void evilsig();
char *getenv();
char *inadport_decimal();
char *inadport_hex();
void pvmbailout();
char *pvmgethome();
char *pvmgetroot();
void reap();


/***************
 **  Globals  **
 **           **
 ***************/

#ifndef HASERRORVARS
extern int errno;
#endif

extern struct task *locltasks;	/* from task.c */
extern int log_fd;				/* from logging.c */
extern struct waitc *waitlist;	/* from waitc.c */

char **epaths = 0;				/* exec search path */
char *debugger = 0;				/* debugger executable */
int debugmask = 0;				/* which debugging info */
struct htab *filehosts = 0;		/* advisory host table from hostfile */
struct htab *hosts = 0;			/* active host table */
int hostertid = 0;				/* slave pvmd starter task */
char *loclsnam = 0;				/* t-d socket or addr file path */
int loclsock = -1;				/* pvmd-task master tcp socket */
#ifndef NOUNIXDOM
char *loclspath = 0;			/* t-d socket path */
#endif
char *myarchname = ARCHCLASS;
int myhostpart = 0;				/* host number shifted to tid host field */
int myndf = 0;					/* host native data enc */
int pvmmytid = 0;				/* this pvmd tid */
int myunixpid = -1;				/* pvmd pid */
int netsock = -1;				/* host-host udp socket */
int nopax = 1;					/* number of outstanding pkts on d-d link */
struct htab *newhosts = 0;		/* hosts being added by pvmd' */
struct htab *oldhosts = 0;		/* real host table (for pvmd') */
int ourudpmtu = UDPMAXLEN;		/* local UDP MTU */
int ppnetsock = -1;				/* pvmd' host-host udp socket */
int pprime = 0;					/* pvmd' pid for when we're forked */
int runstate = 0;				/* pvmd run state */
int pvmschedtid = 0;			/* scheduler task id */
int taskertid = 0;				/* task starter task */
int tidhmask = TIDHOST;			/* mask for host field of tids */
int tidlmask = TIDLOCAL;		/* mask for local field of tids */
int pvm_useruid = -1;			/* our uid */
char *username = 0;				/* our loginname */


/***************
 **  Private  **
 **           **
 ***************/

static char rcsid[] = "$Id: pvmd.c,v 1.41 1996/05/13 21:38:44 manchek Exp $";
static struct deaddata *deads = 0;	/* circ queue of dead task data */
static char pvmtxt[512];		/* scratch for error log */
static int ndead = 0;			/* len of deads */
static struct pkt *opq = 0;		/* outstanding pkt q to all hosts */
static int rdead = 0;			/* read ptr for deads */
static int slavemode = 0;		/* started by master pvmd */
static struct mesg *addmesg = 0;	/* message to self to add slaves */
static int wdead = 0;			/* write ptr for deads */
static fd_set wrk_rfds;			/* fd_sets for select() in work() */
static fd_set wrk_wfds;
/*
static fd_set wrk_efds;
*/
static int wrk_nfds = 0;		/* 1 + highest bit set in fds */


/*	main()
*
*	Pvmd entry point.  Sets up identity (uid, login name, pid, arch),
*	parses and strips the options it understands from argv, configures
*	itself as master or slave, installs signal handlers, initializes
*	the task / wait-context tables and then enters work(), which does
*	not return.
*
*	Options consumed here (all others are passed on to
*	master_config() / slave_config() in the compacted argv):
*		-d<mask>	set debugmask (parsed by pvmxtoi())
*		-n<name>	use <name> instead of gethostname()
*		-s			run as slave pvmd
*		-S			run as slave pvmd, and leave the flag in argv
*		-t			test mode: log versions, time and argv, then exit(0)
*/

main(argc, argv)
	int argc;
	char **argv;
{
	int i, j;
	char *name = "";
	struct passwd *pe;
	int testmode = 0;
	struct timeval tnow;
	char buf[128];

	/* make sure 0, 1, 2 are in use */

	(void)open("/dev/null", O_RDONLY, 0);
	(void)open("/dev/null", O_RDONLY, 0);
	(void)open("/dev/null", O_RDONLY, 0);

	/* environment supplies a default debug mask; -d below overrides it */
	{
		char *p;
		if (p = getenv("PVMDDEBUG"))
			debugmask = pvmxtoi(p);
	}

	if ((pvm_useruid = getuid()) == -1) {
		pvmlogerror("main() can't getuid()\n");
		pvmbailout(0);
	}

	pvmlogopen();

	if (pe = getpwuid(pvm_useruid))
		username = STRALLOC(pe->pw_name);
	else
		pvmlogerror("main() can't getpwuid\n");
	endpwent();

	if ((myunixpid = getpid()) == -1) {
		pvmlogerror("main() can't getpid()\n");
		pvmbailout(0);
	}

	(void)pvmgetroot();		/* fail here if we can't */

	/* export our architecture name to anything we start */
	sprintf(buf, "PVM_ARCH=%s", myarchname);
	pvmputenv(STRALLOC(buf));

	myndf = pvm_archcode(myarchname);

#if defined(IMA_PGON) || defined(IMA_I860) || defined(IMA_CM5) || defined(IMA_SP2MPI)
	mpp_init(&argc, argv);
#endif
#ifdef SHMEM
	mpp_setmtu();
#endif

	/*
	* parse options; i reads argv, j writes the compacted argv that
	* keeps only the arguments not consumed here
	*/
	for (i = j = 1; i < argc; i++) {
		if (argv[i][0] == '-') {
			switch (argv[i][1]) {

			case 'd':
				debugmask = pvmxtoi(argv[i] + 2);
				break;

			case 'n':
				name = argv[i] + 2;
				break;

			case 'S':
				argv[j++] = argv[i];
				/* fall through: -S also implies slave mode */
			case 's':
				slavemode = 1;
				break;

			case 't':
				testmode = 1;
				break;

			default:
				argv[j++] = argv[i];
			}

		} else {
			argv[j++] = argv[i];
		}
	}
	argc = j;

	if (debugmask) {
		sprintf(pvmtxt, "version %s\n", PVM_VER);
		pvmlogerror(pvmtxt);
		sprintf(pvmtxt, "ddpro %d tdpro %d\n", DDPROTOCOL, TDPROTOCOL);
		pvmlogerror(pvmtxt);
		sprintf(pvmtxt, "main() debugmask is %x (%s)\n",
				debugmask, debug_flags(debugmask));
		pvmlogerror(pvmtxt);
	}

	if (!*name) {
		if (gethostname(buf, sizeof(buf)-1) == -1) {
			pvmlogerror("main() can't gethostname()\n");
			pvmbailout(0);
		}
		/*
		* gethostname() need not terminate a truncated name, and buf
		* was used above so its last byte isn't known to be zero.
		*/
		buf[sizeof(buf)-1] = 0;
		name = buf;
	}
	if (testmode) {
		gettimeofday(&tnow, (struct timezone*)0);
		sprintf(pvmtxt, "version %s ddpro %d tdpro %d sdpro %d\n",
				PVM_VER, DDPROTOCOL, TDPROTOCOL, SDPROTOCOL);
		pvmlogerror(pvmtxt);
		pvmlogerror(ctime(&tnow.tv_sec));
		for (i = 0; i < argc; i++) {
			/* bound the argument so a long one can't overrun pvmtxt */
			sprintf(pvmtxt, "argv[%d]=\"%.400s\"\n", i, argv[i]);
			pvmlogerror(pvmtxt);
		}
		exit(0);
	}
	if (slavemode)					/* slave pvmd */
		slave_config(name, argc, argv);

	else							/* master pvmd */
		master_config(name, argc, argv);

	/* detach from the controlling tty / process group as TTYDIS selects */
#if TTYDIS & 8
	setsid();
#endif
#if TTYDIS & 4
	setpgid(0, 0);
#endif
#if TTYDIS & 2
	setpgrp(0, 0);
#endif
#if TTYDIS & 1
	if ((i = open("/dev/tty", O_RDWR, 0)) != -1) {
		(void)ioctl(i, TIOCNOTTY, 0);
		(void)close(i);
	}
#endif

	/* our tid: local host index shifted into the host field, plus TIDPVMD */
	myhostpart = hosts->ht_local << (ffs(tidhmask) - 1);
	pvmmytid = myhostpart | TIDPVMD;

	/* the dead-child fifo must exist before reap() is installed below */
	ndead = 1000;	/* XXX hum, static limit makes this easy to do */
/*
	deads = TALLOC(ndead, int, "pids");
*/
	deads = TALLOC(ndead, struct deaddata, "dead");
	BZERO((char*)deads, ndead * sizeof(struct deaddata));

#ifndef IMA_I860 /* this signal interferes with getcube() on I860 */
#ifdef	SYSVSIGNAL
	(void)signal(SIGCLD, reap);
#else
	(void)signal(SIGCHLD, reap);
#endif
#endif /*IMA_I860*/

	/* only catch INT/TERM if they weren't already being ignored */
	if (signal(SIGINT, SIG_IGN) != SIG_IGN)
		(void)signal(SIGINT, catch);
	if (signal(SIGTERM, SIG_IGN) != SIG_IGN)
		(void)signal(SIGTERM, catch);

	(void)signal(SIGHUP, SIG_IGN);
	(void)signal(SIGPIPE, SIG_IGN);

	(void)signal(SIGILL, evilsig);
	(void)signal(SIGFPE, evilsig);
#ifdef	SIGBUS
	(void)signal(SIGBUS, evilsig);
#endif
	(void)signal(SIGSEGV, evilsig);
#ifdef	SIGSYS
	(void)signal(SIGSYS, evilsig);
#endif

#ifdef	SIGDANGER
	(void)signal(SIGDANGER, biteme);
#endif

#ifdef	STATISTICS
	reset_statistics();
#endif
	task_init();
	wait_init();
	nmd_init();
#ifdef SHMEM
	mpp_init(&argc, argv);
#endif

	/* empty circular list head for the outstanding-packet queue */
	opq = pk_new(0);
	opq->pk_tlink = opq->pk_trlink = opq;

	/* print local socket address on stdout in case someone cares */

	if (!slavemode) {
		char *p = getenv("PVMSOCK");

		/* passing a null pointer for %s is undefined; print an empty line */
		printf("%s\n", p ? p : "");
		fflush(stdout);
	}

/* XXX hack to start slaves automatically */

	if (!slavemode && filehosts) {
		struct hostd *hp;
		int hh;
		int n = 0;

		/* count hostfile entries that are to be started */
		for (hh = filehosts->ht_last; hh >= 1; hh--)
			if ((hp = filehosts->ht_hosts[hh]) && !(hp->hd_flag & HF_NOSTART))
				n++;
		if (n) {
			/* build a DM_ADD message to ourself; work() sends it */
			addmesg = mesg_new(0);
			addmesg->m_cod = DM_ADD;
			pkint(addmesg, n);
			for (hh = 1; hh <= filehosts->ht_last; hh++)
				if ((hp = filehosts->ht_hosts[hh]) && !(hp->hd_flag & HF_NOSTART))
					pkstr(addmesg, hp->hd_name);
			addmesg->m_dst = TIDPVMD;
		}
	}

	work();
	pvmbailout(0);		/* not reached */
	exit(0);
}


/* names of the debug classes, indexed by bit number in debugmask */
static char *dflgs[] = {
	"pkt",	/* 1 */
	"msg",	/* 2 */
	"tsk",	/* 4 */
	"slv",	/* 8 */
	"hst",	/* 10 */
	"sel",	/* 20 */
	"net",	/* 40 */
	"mpp",	/* 80 */
	"sch",	/* 100 */
	"app",	/* 200 */
	"wai",	/* 400 */
	"mem",	/* 800 */
	"sem",	/* 1000 */
	"lck"	/* 2000 */
};

/*	debug_flags()
*
*	Render a debug mask as a comma-separated list of class names,
*	lowest bit first.  Bits with no entry in dflgs[] are ignored.
*	Returns a pointer to a static buffer that the next call overwrites;
*	the string is empty when no known bit is set.
*/

char *
debug_flags(mask)
	int mask;
{
	static char buf[64];
	int nflags = (int)(sizeof(dflgs) / sizeof(dflgs[0]));
	char *out = buf;		/* next free position in buf */
	int idx;

	*out = 0;
	for (idx = 0; idx < nflags; idx++) {
		if (!(mask & (1 << idx)))
			continue;
		if (out != buf)
			*out++ = ',';
		strcpy(out, dflgs[idx]);
		out += strlen(out);
	}
	return buf;
}


/* names of the packet flag bits, indexed by bit number */
static char *ffnames[] = {
	"SOM", "EOM", "DAT", "FIN", "ACK"
};

/*	pkt_flags()
*
*	Render packet flag bits as a comma-separated list of names,
*	lowest bit first, or "0" if none of the named bits is set.
*	Returns a pointer to a static buffer that the next call overwrites.
*/

char *
pkt_flags(ff)
	int ff;
{
	static char buf[64];
	char **namep = ffnames;
	char **nameend = ffnames + sizeof(ffnames)/sizeof(ffnames[0]);
	int bit = 1;

	buf[0] = 0;
	while (namep < nameend) {
		if (ff & bit) {
			if (buf[0])
				strcat(buf, ",");
			strcat(buf, *namep);
		}
		namep++;
		bit <<= 1;
	}
	if (buf[0])
		return buf;
	strcpy(buf, "0");
	return buf;
}


/*	evilsig()
*
*	Handler for fatal signals (installed by main() for ILL, FPE, BUS,
*	SEGV and SYS where defined).  If runstate is PVMDISTASK it just
*	exits with the signal number.  Otherwise it puts every signal the
*	pvmd handles back to its default action, logs the signal, dumps
*	state with i_dump(1) and bails out with -sig.
*/

void
evilsig(sig)
	int sig;
{
	/* signals returned to SIG_DFL, in the order they are reset */
	static int todefault[] = {
		SIGILL,
		SIGFPE,
#ifdef	SIGBUS
		SIGBUS,
#endif
		SIGSEGV,
#ifdef	SIGSYS
		SIGSYS,
#endif
		SIGINT,
		SIGTERM,
		SIGHUP,
		SIGPIPE,
#ifdef	SYSVSIGNAL
		SIGCLD
#else
		SIGCHLD
#endif
	};
	int k;

	if (runstate == PVMDISTASK)
		exit(sig);

	for (k = 0; k < (int)(sizeof(todefault) / sizeof(todefault[0])); k++)
		(void)signal(todefault[k], SIG_DFL);

	sprintf(pvmtxt, "evilsig() caught signal %d\n", sig);
	pvmlogerror(pvmtxt);
	i_dump(1);
/*
	abort();
*/
	/* negative argument is passed so pvmbailout() receives -sig */
	pvmbailout(-sig);
}


/*	catch()
*
*	Handler for SIGINT and SIGTERM.  With runstate PVMDISTASK it exits
*	at once using the signal number as status; otherwise it restores
*	the default action for both signals, logs which one arrived and
*	calls pvmbailout() with the signal number.
*/

void
catch(sig)
	int sig;
{
	if (runstate == PVMDISTASK) {
		exit(sig);

	} else {
		(void)signal(SIGINT, SIG_DFL);
		(void)signal(SIGTERM, SIG_DFL);

		sprintf(pvmtxt, "catch() caught signal %d\n", sig);
		pvmlogerror(pvmtxt);

		pvmbailout(sig);
	}
}


#ifdef	SIGDANGER
/*	biteme()
*
*	Handler for SIGDANGER: log that the signal was received and
*	carry on.  Where SYSVSIGNAL is defined the handler is installed
*	again before returning.
*/

void
biteme(sig)
	int sig;
{
	char *taunt = "the mad fools, when will they learn?\n";

	sprintf(pvmtxt, "biteme() caught signal %d and spaced it.\n", sig);
	pvmlogerror(pvmtxt);
	pvmlogerror(taunt);

#ifdef	SYSVSIGNAL
	(void)signal(SIGDANGER, biteme);
#endif
}
#endif	/*SIGDANGER*/


/*	reap()
*
*	Child process has exited.  Put its pid in the fifo of tasks
*	to be cleaned up (in the work loop).
*
*	Signal handler for SIGCHLD (SIGCLD with SYSVSIGNAL).  Collects
*	exited children with wait3(), waitpid() or wait(), whichever the
*	NOWAIT3 / NOWAITPID macros select, and appends one deaddata record
*	per child at deads[wdead], wrapping wdead at ndead.  Resource usage
*	times are recorded only when wait3() and struct rusage are
*	available; otherwise they are stored as zero.
*
*	errno is saved on entry and restored on exit: the wait calls set it
*	(e.g. to ECHILD once there is nothing left to collect), and this
*	handler can interrupt code that is about to examine errno after a
*	failed system call.
*
*	NOTE(review): nothing here detects wdead catching up with rdead, so
*	more than ndead unprocessed entries would overwrite older ones.
*/

void
reap(sig)
	int sig;
{
	int pid;
	int es = 0;
	int saved_errno = errno;	/* restored before returning */
#ifndef NOWAIT3
#if defined(RUSAGE_SELF)
	struct rusage rus;
#else
	int rus;
#endif
#endif

	sig = sig;		/* unused */

#ifdef	NOWAIT3
#ifdef	NOWAITPID
	if ((pid = wait(&es)) > 0)
#else
	while ((pid = waitpid(-1, &es, WNOHANG)) > 0)
#endif
#else	/*NOWAIT3*/
	while ((pid = wait3(&es, WNOHANG, &rus)) > 0)
#endif	/*NOWAIT3*/
	{
#if !defined(NOWAIT3) && defined(RUSAGE_SELF)
		deads[wdead].dd_ut = rus.ru_utime;
		deads[wdead].dd_st = rus.ru_stime;
#else
		deads[wdead].dd_ut.tv_sec = 0;
		deads[wdead].dd_ut.tv_usec = 0;
		deads[wdead].dd_st.tv_sec = 0;
		deads[wdead].dd_st.tv_usec = 0;
#endif
		deads[wdead].dd_pid = pid;
		deads[wdead].dd_es = es;
		if (++wdead >= ndead)
			wdead = 0;
	}
#ifdef	SYSVSIGNAL
	(void)signal(SIGCLD, reap);
#endif

	errno = saved_errno;
}


/*	pvmbailout()
*
*	We're hosed.  Clean up as much as possible and exit.
*/

void
pvmbailout(n)
	int n;
{
	struct task *tp;

	sprintf(pvmtxt, "pvmbailout(%d)\n", n);
	pvmlogerror(pvmtxt);

	/* sockaddr file */

	if (loclsnam)
		(void)unlink(loclsnam);

	/* kill local tasks */

#ifdef SHMEM
	mpp_cleanup();
#endif

	if (locltasks)
		for (tp = locltasks->t_link; tp != locltasks; tp = tp->t_link) {
			if (tp->t_pid)
				(void)kill(tp->t_pid, SIGTERM);
			if (tp->t_authnam)
				(void)unlink(tp->t_authnam);
		}

	/* shutdown slave pvmds / notify master */

	if (netsock != -1) {
		char dummy[DDFRAGHDR];
		int hh;
		struct hostd *hp;

		if (debugmask)
			pvmlogerror("sending FIN|ACK to all pvmds\n");
		for (hh = hosts->ht_last; hh >= 1; hh--)
			if ((hp = hosts->ht_hosts[hh]) && hp->hd_hostpart != myhostpart) {
				pvmput32(dummy, hp->hd_hostpart | TIDPVMD);
				pvmput32(dummy + 4, myhostpart | TIDPVMD);
				pvmput16(dummy + 8, 0);
				pvmput16(dummy + 10, 0);
				pvmput8(dummy + 12, FFFIN|FFACK);
				sendto(netsock, dummy, DDFRAGHDR, 0,
						(struct sockaddr*)&hp->hd_sad, sizeof(hp->hd_sad));
			}
	}

#ifndef NOUNIXDOM
	if (loclspath)
		(void)unlink(loclspath);
#endif

	if (n < 0)
		abort();
	exit(n);
}


/*	wrk_fds_init()
*
*	Empty the read and write fd sets that work() hands to select()
*	and reset the highest-fd-plus-one count to zero.
*/

wrk_fds_init()
{
	FD_ZERO(&wrk_wfds);
	FD_ZERO(&wrk_rfds);
	wrk_nfds = 0;
}


wrk_fds_add(fd, sets)
	int fd;				/* the fd */
	int sets;			/* which sets */
{
#ifdef	SANITY
	if (fd < 0 || fd >= FD_SETSIZE) {
		sprintf(pvmtxt, "wrk_fds_add() bad fd %d\n", fd);
		pvmlogerror(pvmtxt);
		return 1;
	}
#endif
	if (sets & 1)
		FD_SET(fd, &wrk_rfds);
	if (sets & 2)
		FD_SET(fd, &wrk_wfds);
/*
	if (sets & 4)
		FD_SET(fd, &wrk_efds);
*/

	/* if this is new highest, adjust nfds */

	if (fd >= wrk_nfds)
		wrk_nfds = fd + 1;
	return 0;
}


wrk_fds_delete(fd, sets)
	int fd;				/* the fd */
	int sets;			/* which sets */
{
#ifdef	SANITY
	if (fd < 0 || fd >= FD_SETSIZE) {
		sprintf(pvmtxt, "wrk_fds_delete() bad fd %d\n", fd);
		pvmlogerror(pvmtxt);
		return 1;
	}
#endif
	if (sets & 1)
		FD_CLR(fd, &wrk_rfds);
	if (sets & 2)
		FD_CLR(fd, &wrk_wfds);
/*
	if (sets & 4)
		FD_CLR(fd, &wrk_efds);
*/

	/* if this was highest, may have to adjust nfds to new highest */

	if (fd + 1 == wrk_nfds)
		while (wrk_nfds > 0) {
			wrk_nfds--;
			if (FD_ISSET(wrk_nfds, &wrk_rfds)
			|| FD_ISSET(wrk_nfds, &wrk_wfds)
/*
			|| FD_ISSET(wrk_nfds, &wrk_efds)
*/
			) {
				wrk_nfds++;
				break;
			}
		}
	return 0;
}


/*	print_fdset()
*
*	Log one line made of the label pad followed by a comma-separated
*	list of the descriptors below n that are members of *f.
*
*	The line is built in pvmtxt, so output is bounded to that buffer:
*	an over-long label is truncated, and if the list does not fit it
*	is cut short and "..." is appended.  n is clamped to FD_SETSIZE
*	because FD_ISSET is not defined beyond that.
*/

print_fdset(pad, n, f)
	char *pad;		/* label at head */
	int n;			/* max fd + 1 */
	fd_set *f;		/* fd set */
{
	char *p = pvmtxt;
	/* text must stop here to leave room for "...", "\n" and the NUL */
	char *lim = pvmtxt + sizeof(pvmtxt) - 5;
	char num[16];	/* one separator + decimal fd */
	int len;
	int i;
	char *s = "";

	len = strlen(pad);
	if (len > lim - p)
		len = lim - p;
	strncpy(p, pad, len);
	p += len;
	*p = 0;

	if (n > FD_SETSIZE)
		n = FD_SETSIZE;

	for (i = 0; i < n; i++)
		if (FD_ISSET(i, f)) {
			sprintf(num, "%s%d", s, i);
			len = strlen(num);
			if (len > lim - p) {
				strcpy(p, "...");
				p += 3;
				break;
			}
			strcpy(p, num);
			p += len;
			s = ",";
		}
	strcpy(p, "\n");
	pvmlogerror(pvmtxt);
}


/*	clear_opq_of()
*
*	Walk the outstanding-packet queue opq and discard every packet
*	addressed to tid whose pk_link is null (the original note here:
*	packets for the host in opq but _not_ in host hd_opq).  Each such
*	packet is unlinked from opq and freed.  Always returns 0.
*/

int
clear_opq_of(tid)
	int tid;			/* host */
{
	struct pkt *cur, *nxt;

	cur = opq->pk_tlink;
	while (cur != opq) {
		/* remember the successor before cur can be unlinked and freed */
		nxt = cur->pk_tlink;
		if (cur->pk_dst == tid && !cur->pk_link) {
			LISTDELETE(cur, pk_tlink, pk_trlink);
			pk_free(cur);
		}
		cur = nxt;
	}
	return 0;
}


/*	work()
*
*	The whole sausage
*
*	Main loop of the pvmd; it never returns (exit is via pvmbailout()).
*	After one-time setup (log banner, send the queued DM_ADD message if
*	main() built one, start the bail and ping timers, register loclsock
*	and netsock for reading) each pass of the loop:
*
*	  1. drains the dead-child fifo filled by reap(),
*	  2. calls netoutput() to send due packets,
*	  3. bails out if halting or if still in STARTUP past tbail,
*	  4. sends a DM_NULL keepalive to the next host when tping expires,
*	  5. computes the select() timeout from the head of opq or tping,
*	  6. select()s on the work fd sets (polling mpp_probe() on the
*	     PGON / I860 / SHMEM builds),
*	  7. dispatches ready descriptors to netinput(), loclconn(),
*	     loclinput(), locloutput() and loclstout().
*/

work()
{
	static int lastpinged = 0;	/* host that got last keepalive message */
	fd_set rfds, wfds;			/* result of select */
/*
	fd_set efds;
*/
	int nrdy;					/* number of fds ready after select */
	struct timeval tbail;		/* time to bail if state = STARTUP */
	struct timeval tping;		/* time to send next keepalive packet */
	struct timeval tnow;
	struct timeval tout;
	struct mesg *mp;
	struct task *tp;
	struct hostd *hp;
#if defined(IMA_PGON) || defined(IMA_I860) || defined(SHMEM)
	int nodemsg = 0;
#endif
#ifdef	SHMEM
	int someclosed;
#endif

	gettimeofday(&tnow, (struct timezone*)0);
	if (debugmask || myhostpart) {
		sprintf(pvmtxt, "%s (%s) %s %s\n",
				hosts->ht_hosts[hosts->ht_local]->hd_name,
				inadport_decimal(&hosts->ht_hosts[hosts->ht_local]->hd_sad),
				myarchname,
				PVM_VER);
		pvmlogerror(pvmtxt);
		sprintf(pvmtxt, "ready %s", ctime(&tnow.tv_sec));
		pvmlogerror(pvmtxt);
	}

	/*
	* remind myself to start those pesky slave pvmds
	*/

	if (addmesg) {
		struct mesg *mp = addmesg;

		/* clear the global first so the message is sent only once */
		addmesg = 0;
		sendmessage(mp);
	}

	/*
	* init bail (for PVMDSTARTUP) and ping times
	*/

	tout.tv_sec = DDBAILTIME;
	tout.tv_usec = 0;
	TVXADDY(&tbail, &tnow, &tout);

	tout.tv_sec = DDPINGTIME;
	tout.tv_usec = 0;
	TVXADDY(&tping, &tnow, &tout);

	/* init select fd sets */

	wrk_fds_init();

	if (loclsock >= 0)
		wrk_fds_add(loclsock, 1);
	wrk_fds_add(netsock, 1);

	for (; ; ) {

		/*
		*	clean up after any tasks that we got SIGCHLDs for
		*/
		while (rdead != wdead) {
			if (deads[rdead].dd_pid == pprime) {
				/*
				* the child that exited is the forked pvmd' (pprime):
				* fail host slot 0, drop its queued packets, then
				* read and discard whatever is waiting on ppnetsock
				*/
				int cc;
				int oslen;
				struct sockaddr_in osad;
				struct timeval t;
				char buf[DDFRAGHDR];

				hostfailentry(hosts->ht_hosts[0]);
				clear_opq_of((int)(TIDPVMD | hosts->ht_hosts[0]->hd_hostpart));
				pprime = 0;

				while (1) {
					FD_ZERO(&rfds);
					FD_SET(ppnetsock, &rfds);
					t.tv_sec = 0;
					t.tv_usec = 0;
					cc = select(ppnetsock + 1,
#ifdef	FDSETISINT
							(int *)&rfds, (int *)0, (int *)0,
#else
							&rfds, (fd_set *)0, (fd_set *)0,
#endif
							&t);
					if (cc == 1) {
						oslen = sizeof(osad);
						recvfrom(ppnetsock, buf, sizeof(buf),
								0, (struct sockaddr*)&osad, &oslen);

					} else if (cc != -1 || errno != EINTR)
						break;		/* nothing left (or a real error) */
				}

			} else {
				if (tp = task_findpid(deads[rdead].dd_pid)) {

		/* check for output one last time
		   XXX this could be cleaner by going through main select again
		   XXX before flushing the task */

					tp->t_status = deads[rdead].dd_es;
					tp->t_utime = deads[rdead].dd_ut;
					tp->t_stime = deads[rdead].dd_st;
					while (tp->t_out >= 0) {
						fd_set rfds;	/* local; hides the outer rfds */

						FD_ZERO(&rfds);
						FD_SET(tp->t_out, &rfds);
						TVCLEAR(&tout);
						if (select(tp->t_out + 1,
#ifdef	FDSETISINT
								(int *)&rfds, (int *)0, (int *)0,
#else
								&rfds, (fd_set *)0, (fd_set *)0,
#endif
								&tout) == 1)
							loclstout(tp);

						else
							break;
					}
#if defined(IMA_PGON)
					mpp_free(tp);
#endif
					task_cleanup(tp);
					task_free(tp);
				}
			}
			if (++rdead >= ndead)
				rdead = 0;
		}

		netoutput();

		if (runstate == PVMDHALTING) {
			pvmlogerror("work() pvmd halting\n");
			pvmbailout(0);
		}

		/* bail if new slave and haven't been configured for too long */
		gettimeofday(&tnow, (struct timezone*)0);
		if (runstate == PVMDSTARTUP && TVXLTY(&tbail, &tnow)) {
			pvmlogerror("work() run = STARTUP, timed out waiting for master\n");
			pvmbailout(0);
		}

		/*
		* send keepalive message to remote pvmd once in a while
		*/
		if (TVXLTY(&tping, &tnow)) {
			if (debugmask & (PDMPACKET|PDMSELECT))
				pvmlogerror("work() ping timer\n");
			if (runstate == PVMDNORMAL || runstate == PVMDHTUPD) {
				/* advance round-robin to the next occupied host slot */
				do {
					if (++lastpinged > hosts->ht_last)
						lastpinged = 1;
				} while (!(hp = hosts->ht_hosts[lastpinged]));

				/* ping only other hosts, and only if their txq is empty */
				if (hp->hd_hostpart != myhostpart
				&& hp->hd_txq->pk_link == hp->hd_txq) {
					mp = mesg_new(0);
					mp->m_cod = DM_NULL;
					mp->m_dst = hp->hd_hostpart | TIDPVMD;
					sendmessage(mp);
				}
			}
			tout.tv_sec = DDPINGTIME;
			tout.tv_usec = 0;
			TVXADDY(&tping, &tnow, &tout);
		}

		/*
		* figure select timeout
		*/

#if !defined(IMA_PGON) && !defined(IMA_I860)
		/* wake for the first queued packet's retry time, else the ping */
		if (opq->pk_tlink == opq)
			tout = tping;
		else
			tout = opq->pk_tlink->pk_rtv;

		/* convert the absolute time to an interval, not below zero */
		if (TVXLTY(&tout, &tnow)) {
			TVCLEAR(&tout);

		} else {
			TVXSUBY(&tout, &tout, &tnow);
		}

		if (debugmask & PDMSELECT) {
			/* timeval fields need not be int; cast to match the format */
			sprintf(pvmtxt, "work() select tout is %d.%06d\n",
					(int)tout.tv_sec, (int)tout.tv_usec);
			pvmlogerror(pvmtxt);
		}

#endif	/*!defined(IMA_PGON) && !defined(IMA_I860)*/

#ifdef SHMEM
		if ((nodemsg = mpp_probe()) == 1) {
			mpp_input();
			TVCLEAR(&tout);
		}
#endif

		/* select() overwrites its sets, so work on copies */
		rfds = wrk_rfds;
		wfds = wrk_wfds;
/*
		efds = wrk_efds;
*/
		if (debugmask & PDMSELECT) {
			sprintf(pvmtxt, "work() wrk_nfds=%d\n", wrk_nfds);
			pvmlogerror(pvmtxt);
			print_fdset("work() rfds=", wrk_nfds, &rfds);
			print_fdset("work() wfds=", wrk_nfds, &wfds);
		}

#if !defined(IMA_PGON) && !defined(IMA_I860)

		if ((nrdy = select(wrk_nfds,
#ifdef	FDSETISINT
				(int *)&rfds, (int *)&wfds, (int *)0,
#else
				&rfds, &wfds, (fd_set *)0,
#endif
				&tout)) == -1) {
			/* EINTR is tolerated; anything else is logged and fatal */
			if (errno != EINTR) {
				pvmlogperror("work() select");
				sprintf(pvmtxt, " wrk_nfds=%d\n", wrk_nfds);
				pvmlogerror(pvmtxt);
				print_fdset(" rfds=", wrk_nfds, &wrk_rfds);
				print_fdset(" wfds=", wrk_nfds, &wrk_wfds);
				sprintf(pvmtxt, " netsock=%d, ppnetsock=%d, loclsock=%d\n",
						netsock, ppnetsock, loclsock);
				pvmlogerror(pvmtxt);
				task_dump();
				pvmbailout(0);
			}
		}

#else /*IMA_PGON/IMA_I860*/

		/* poll node messages and sockets until one of them has work */
		do {
			if ((nodemsg = mpp_probe()) == 1) {
				mpp_input();
				TVCLEAR(&tout);

			} else {
				tout.tv_sec = 0;
				tout.tv_usec = TIMEOUT;
			}
			rfds = wrk_rfds;
			wfds = wrk_wfds;
			if ((nrdy = select(wrk_nfds,
#ifdef	FDSETISINT
					(int *)&rfds, (int *)&wfds, (int *)0,
#else
					&rfds, &wfds, (fd_set *)0,
#endif
					&tout))
			== -1) {
				if (errno != EINTR) {
					pvmlogperror("work() select");
					pvmbailout(0);
				}
			}
		} while(!(nrdy || nodemsg));

#endif /*IMA_PGON/IMA_I860*/

#ifdef	STATISTICS
		switch (nrdy) {
		case -1:
			stats.selneg++;
			break;
		case 0:
			stats.selzer++;
			break;
		default:
			stats.selrdy++;
			break;
		}
#endif
		if (debugmask & PDMSELECT) {
			sprintf(pvmtxt, "work() SELECT returns %d\n", nrdy);
			pvmlogerror(pvmtxt);
		}

	/*
	*	check network socket and local master socket for action
	*/

		if (nrdy > 0) {
			if (FD_ISSET(netsock, &rfds)) {
				nrdy--;
				netinput();
			}
			if (loclsock >= 0 && FD_ISSET(loclsock, &rfds)) {
				nrdy--;
				loclconn();
			}
		}

	/*
	*	check tasks for action
	*/

#ifdef	SHMEM
		someclosed = 0;
#endif
		if (loclsock >= 0) {
			/* nrdy counts unhandled ready fds; stop once it reaches zero */
			for (tp = locltasks->t_link;
					nrdy > 0 && tp != locltasks;
					tp = tp->t_link) {

				if (tp->t_sock >= 0 && FD_ISSET(tp->t_sock, &rfds)) {
					FD_CLR(tp->t_sock, &rfds);
					nrdy--;
					if (loclinput(tp)) {
#ifdef	SHMEM
						if (tp->t_tid == 0)
							someclosed++;
#endif
						if (debugmask & PDMTASK) {
							sprintf(pvmtxt,
									"work() error reading from t%x, marking dead\n",
									tp->t_tid);
							pvmlogerror(pvmtxt);
						}
						/*
						* tasks we forked (TF_FORKD) are left for the
						* SIGCHLD path; others are freed now.  step back
						* first so the loop's tp->t_link stays valid.
						*/
						if (!(tp->t_flag & TF_FORKD)) {
							tp = tp->t_rlink;
							task_cleanup(tp->t_link);
							task_free(tp->t_link);

						} else
							wrk_fds_delete(tp->t_sock, 3);
						continue;
					}
				}

				if (tp->t_sock >= 0 && FD_ISSET(tp->t_sock, &wfds)) {
					FD_CLR(tp->t_sock, &wfds);
					nrdy--;
					if (locloutput(tp)) {
#ifdef	SHMEM
						if (tp->t_tid == 0)
							someclosed++;
#endif
						/* same disposal rule as the read side above */
						if (!(tp->t_flag & TF_FORKD)) {
							tp = tp->t_rlink;
							task_cleanup(tp->t_link);
							task_free(tp->t_link);

						} else
							wrk_fds_delete(tp->t_sock, 3);
						continue;
					}
				}

				if (tp->t_out >= 0 && FD_ISSET(tp->t_out, &rfds)) {
					FD_CLR(tp->t_out, &rfds);
					nrdy--;
					loclstout(tp);
				}
			}
		}
#if defined(IMA_CM5) || defined(IMA_SP2MPI)
		mpp_output((struct task *)0, (struct pkt *)0);
#endif
#ifdef	SHMEM
		/* a task with zero tid closed its socket; look for zombies */
		if (someclosed)
			mpp_dredge();
#endif
	}
}


/*	netoutput()
*
*	Send packets out the wire to remote pvmds.
*/

netoutput()
{
	struct timeval tnow, tx;
	struct pkt *pp, *pp2;
	struct hostd *hp;
	char *cp;
	int len;
	int cc;
	char dummy[DDFRAGHDR];

/*
	len = 0;
	for (pp = opq->pk_tlink; pp != opq; pp = pp->pk_tlink)
		len++;
	sprintf(pvmtxt, "netoutput() %d in opq\n", len);
	pvmlogerror(pvmtxt);
*/
	if (opq->pk_tlink == opq)
		return 0;

	/*
	* send any pkts whose time has come
	*/

	gettimeofday(&tnow, (struct timezone*)0);

	while ((pp = opq->pk_tlink) != opq && TVXLTY(&pp->pk_rtv, &tnow)) {

	/*
	* fail if we've tried too hard
	*/
		hp = pp->pk_hostd;
		if (pp->pk_nrt >= DDMINRETRIES
		&& pp->pk_rto.tv_sec >= DDMINTIMEOUT) {		/* host is toast */
			sprintf(pvmtxt,
					"netoutput() timed out sending to %s after %d, %d.%06d\n",
					hp->hd_name, pp->pk_nrt,
					pp->pk_rto.tv_sec, pp->pk_rto.tv_usec);
			pvmlogerror(pvmtxt);
			hd_dump(hp);
			hostfailentry(hp);
			clear_opq_of((int)(TIDPVMD | hp->hd_hostpart));
			ht_delete(hosts, hp);
			if (newhosts)
				ht_delete(newhosts, hp);
			continue;
		}

		cp = pp->pk_dat;
		len = pp->pk_len;
		if (pp->pk_flag & FFSOM) {
			cp -= TTMSGHDR;
			len += TTMSGHDR;
			if (cp < pp->pk_buf) {
				pvmlogerror("netoutput() no headroom for message header\n");
				return 0;
			}
			pvmput32(cp, pp->pk_cod);
			pvmput32(cp + 4, pp->pk_enc);
			pvmput32(cp + 8, pp->pk_wid);
			pvmput32(cp + 12, pp->pk_crc);
		}
		cp -= DDFRAGHDR;
		len += DDFRAGHDR;

	/*
	* save under packet header, because databuf may be shared.
	* we don't worry about message header, because it's only at the head.
	*/
		BCOPY(cp, dummy, sizeof(dummy));
		if (cp < pp->pk_buf) {
			pvmlogerror("netoutput() no headroom for packet header\n");
			return 0;
		}

		if (debugmask & PDMPACKET) {
			sprintf(pvmtxt,
			"netoutput() pkt to %s src t%x dst t%x f %s len %d seq %d ack %d retry %d\n",
					hp->hd_name, pp->pk_src, pp->pk_dst, pkt_flags(pp->pk_flag),
					pp->pk_len, pp->pk_seq, pp->pk_ack, pp->pk_nrt);
			pvmlogerror(pvmtxt);
		}
		pvmput32(cp, pp->pk_dst);
		pvmput32(cp + 4, pp->pk_src);
		pvmput16(cp + 8, pp->pk_seq);
		pvmput16(cp + 10, pp->pk_ack);
		pvmput32(cp + 12, 0);			/* to keep purify happy */
		pvmput8(cp + 12, pp->pk_flag);
#if 0
		/* drop (don't send) random packets */
		if (!(random() & 3)) {
			pvmlogerror("netoutput() darn, dropped one\n");
			cc = -1;
		} else
#endif
			if ((cc = sendto(netsock, cp, len, 0,
					(struct sockaddr*)&hp->hd_sad, sizeof(hp->hd_sad))) == -1
			&& errno != EINTR
			&& errno != ENOBUFS
#ifdef	IMA_LINUX
			&& errno != ENOMEM
#endif
			) {
				pvmlogperror("netoutput() sendto");
#ifdef	ENETDOWN
				if (errno == ENETDOWN) ;
				else
#endif
#ifdef	ENETUNREACH
				if (errno == ENETUNREACH) ;
				else
#endif
#ifdef	EHOSTDOWN
				if (errno == EHOSTDOWN) ;
				else
#endif
#ifdef	EHOSTUNREACH
				if (errno == EHOSTUNREACH) ;
				else
#endif
#if defined(IMA_SUN4SOL2) || defined(IMA_X86SOL2) || defined(IMA_SUNMP) || defined(IMA_UXPM)
	/* life, don't talk to me about life... */
				if (errno == ECHILD)
					pvmlogerror("this message brought to you by solaris\n");
				else
#endif
				pvmbailout(0);
			}
#ifdef	STATISTICS
		if (cc == -1)
			stats.sdneg++;
		else
			stats.sdok++;
#endif

		BCOPY(dummy, cp, sizeof(dummy));	/* restore under header */

	/*
	* set timer for next retry
	*/
		if (cc != -1) {
			if ((pp->pk_flag & (FFFIN|FFACK)) == (FFFIN|FFACK)) {
				pk_free(pp);
				if (hp != hosts->ht_hosts[0]) {
					hostfailentry(hp);
					clear_opq_of((int)(TIDPVMD | hp->hd_hostpart));
					ht_delete(hosts, hp);
					if (newhosts)
						ht_delete(newhosts, hp);
				}
				continue;
			}
			if (!((pp->pk_flag & FFDAT)
					|| (pp->pk_flag & (FFFIN|FFACK)) == FFFIN)) {
				pk_free(pp);
				continue;
			}
			if (!TVISSET(&pp->pk_at))
				pp->pk_at = tnow;
			TVXADDY(&pp->pk_rtv, &tnow, &pp->pk_rta);
			TVXADDY(&pp->pk_rto, &pp->pk_rto, &pp->pk_rta);
#ifdef	STATISTICS
			if (pp->pk_nrt)
				stats.netret++;
#endif
			++pp->pk_nrt;
			if (pp->pk_rta.tv_sec < DDMAXRTT) {
				TVXADDY(&pp->pk_rta, &pp->pk_rta, &pp->pk_rta);
			}

		} else {
			tx.tv_sec = DDERRRETRY/1000000;
			tx.tv_usec = DDERRRETRY%1000000;
			TVXADDY(&pp->pk_rtv, &tnow, &tx);
			TVXADDY(&pp->pk_rto, &pp->pk_rto, &tx);
		}

		/* reinsert packet into opq */

		LISTDELETE(pp, pk_tlink, pk_trlink);
		for (pp2 = opq->pk_trlink; pp2 != opq; pp2 = pp2->pk_trlink)
			if (TVXLTY(&pp2->pk_rtv, &pp->pk_rtv))
				break;
		LISTPUTAFTER(pp2, pp, pk_tlink, pk_trlink);
	}
	return 0;
}


/*	netinput()
*
*	Input from a remote pvmd.
*	Reads one datagram from netsock, decodes the pvmd-pvmd fragment
*	header (plus the message header on a start-of-message pkt) and
*	checks that it came from the host named by its source tid.
*	Then does the protocol work:
*	  - FIN|ACK from the master sets runstate to PVMDHALTING (or exits
*	    at once if runstate is PVMDPRIME); FIN|ACK from any other host
*	    means that host is gone and it is cleaned out of the tables.
*	  - an ACK retires the matching pkt on the host's opq and, for a
*	    data pkt that was sent only once, updates the host's rtt.
*	  - if the host has fewer than nopax pkts outstanding, one more
*	    pkt is moved from its txq to opq.
*	  - data and FIN pkts are acked.
*	  - new data pkts go into the reordering q; pkts that are now in
*	    sequence are passed to netinpkt().
*
*	Datagrams too short to hold a fragment header are logged and
*	dropped before any header field is decoded.
*
*	Always returns 0.
*/

int
netinput()
{
	struct sockaddr_in osad;		/* sender's ip addr */
	int oslen;						/* sockaddr length */
	struct timeval tnow;
	struct pkt *pp, *pp2;
	struct hostd *hp;
	char *cp;
	int sqn;						/* seq num of this pkt */
	int aqn;						/* seq num being acked */
	int ff;							/* pkt flags */
	int dst;
	int src;
	int hh;							/* host table index of sender */
	int already;					/* true if we've seen this pkt before */
	struct timeval tdiff;			/* packet rtt */
	int rttusec;

	/*
	* alloc new pkt buffer and read packet
	*/

	pp = pk_new(ourudpmtu);
	/* keep enough headroom for the larger of the two frag headers */
	if (TDFRAGHDR > DDFRAGHDR)
		pp->pk_dat += TDFRAGHDR - DDFRAGHDR;

	oslen = sizeof(osad);
	if ((pp->pk_len = recvfrom(netsock, pp->pk_dat,
			pp->pk_max - (pp->pk_dat - pp->pk_buf),
			0, (struct sockaddr*)&osad, &oslen)) == -1) {
		if (errno != EINTR)
			pvmlogperror("netinput() recvfrom(netsock)");
		goto scrap;
	}

#if 0
	/* drop random packets */
	if (!(random() & 3)) {
		pvmlogerror("netinput() oops, dropped one\n");
		goto scrap;
	}
#endif

#ifdef	STATISTICS
	stats.rfok++;
#endif

	/*
	* a datagram shorter than the fragment header can't be decoded;
	* the header fields would come from uninitialized buffer space
	* and pk_len would go negative.
	*/

	if (pp->pk_len < DDFRAGHDR) {
		sprintf(pvmtxt, "netinput() runt pkt (%d bytes) from %s\n",
				pp->pk_len, inadport_decimal(&osad));
		pvmlogerror(pvmtxt);
		goto scrap;
	}

	/*
	* decode fragment header: dst, src, seq, ack, flags
	*/

	cp = pp->pk_dat;
	pp->pk_len -= DDFRAGHDR;
	pp->pk_dat += DDFRAGHDR;
	dst = pp->pk_dst = pvmget32(cp);
	src = pp->pk_src = pvmget32(cp + 4);
	sqn = pp->pk_seq = pvmget16(cp + 8);
	aqn = pvmget16(cp + 10);
	ff = pp->pk_flag = pvmget8(cp + 12);
	if (ff & FFSOM) {
		/* first frag of a message also carries the message header */
		if (pp->pk_len < TTMSGHDR) {
			sprintf(pvmtxt, "netinput() SOM pkt src t%x dst t%x too short\n",
					src, dst);
			pvmlogerror(pvmtxt);
			goto scrap;
		}
		cp += DDFRAGHDR;
		pp->pk_cod = pvmget32(cp);
		pp->pk_enc = pvmget32(cp + 4);
		pp->pk_wid = pvmget32(cp + 8);
		pp->pk_crc = pvmget32(cp + 12);
		pp->pk_len -= TTMSGHDR;
		pp->pk_dat += TTMSGHDR;
	}

	/*
	* make sure it's from where it claims
	*/

	hh = (src & tidhmask) >> (ffs(tidhmask) - 1);
	if (hh < 0 || hh > hosts->ht_last || !(hp = hosts->ht_hosts[hh])
#ifndef IMA_LINUX
	/*
	* XXX removing these lines is a hack and reduces security between
	* XXX pvmds somewhat, but it's the easiest fix for Linux right now.
	*/
	|| (osad.sin_addr.s_addr != hp->hd_sad.sin_addr.s_addr)
	|| (osad.sin_port != hp->hd_sad.sin_port)
#endif
	) {
		sprintf(pvmtxt, "netinput() bogus pkt from %s\n",
				inadport_decimal(&osad));
		pvmlogerror(pvmtxt);
		goto scrap;
	}

	if (debugmask & PDMPACKET) {
		sprintf(pvmtxt,
		"netinput() pkt from %s src t%x dst t%x f %s len %d seq %d ack %d\n",
				hp->hd_name, src, dst, pkt_flags(ff), pp->pk_len, sqn, aqn);
		pvmlogerror(pvmtxt);
	}

	if ((ff & (FFFIN|FFACK)) == (FFFIN|FFACK)) {
		if (hh == hosts->ht_master) {
	/*
	* FIN|ACK from master means we should bailout
	*/
			if (runstate == PVMDPRIME) {
				if (debugmask & PDMSTARTUP)
					pvmlogerror("work() PVMDPRIME halting\n");
				exit(0);
			}
			sprintf(pvmtxt, "netinput() FIN|ACK from master (%s)\n",
					hp->hd_name);
			pvmlogerror(pvmtxt);
			runstate = PVMDHALTING;

		} else {
	/*
	* FIN|ACK from slave means it croaked
	*/
			sprintf(pvmtxt, "netinput() FIN|ACK from %s\n",
					hp->hd_name);
			pvmlogerror(pvmtxt);
			hd_dump(hp);
			hostfailentry(hp);
			clear_opq_of((int)(TIDPVMD | hp->hd_hostpart));
			/* entries with a zero hostpart stay in the tables */
			if (hp->hd_hostpart) {
				ht_delete(hosts, hp);
				if (newhosts)
					ht_delete(newhosts, hp);
			}
		}
		goto scrap;
	}

	/*
	* done with outstanding packet covered by this ack
	*/

	if (ff & FFACK) {
		for (pp2 = hp->hd_opq->pk_link; pp2 != hp->hd_opq; pp2 = pp2->pk_link)
			if (pp2->pk_seq == aqn) {
				if (pp2->pk_flag & FFDAT) {
					/*
					* only time data pkts that were sent exactly once.
					* new rtt = (sample + 3 * old rtt) / 4, with the
					* sample clamped to DDMAXRTT seconds.
					*/
					if (pp2->pk_nrt == 1) {
						gettimeofday(&tnow, (struct timezone*)0);

						TVXSUBY(&tdiff, &tnow, &pp2->pk_at);
						rttusec = tdiff.tv_sec * 1000000 + tdiff.tv_usec;
						if (rttusec < 1)
							rttusec = 1000;	/* XXX const */
						else
							if (rttusec > DDMAXRTT*1000000)
								rttusec = DDMAXRTT*1000000;
						rttusec += 3 * (hp->hd_rtt.tv_sec * 1000000 + hp->hd_rtt.tv_usec);
						rttusec /= 4;
						hp->hd_rtt.tv_sec = rttusec / 1000000;
						hp->hd_rtt.tv_usec = rttusec % 1000000;
					}
				}
				/* our FIN was acked; answer with the closing FIN|ACK */
				if (pp2->pk_flag & FFFIN) {
					finack_to_host(hp);
				}
				hp->hd_nop--;
				LISTDELETE(pp2, pk_link, pk_rlink);
				pk_free(pp2);
				break;
			}
	}

	/*
	* move another pkt to output q
	*/

/*
	if ((hp->hd_opq->pk_link == hp->hd_opq)
*/
	if (hp->hd_nop < nopax
	&& (hp->hd_txq->pk_link != hp->hd_txq)) {
		if (debugmask & PDMPACKET) {
			sprintf(pvmtxt, "netinput() pkt to opq\n");
			pvmlogerror(pvmtxt);
		}
		pp2 = hp->hd_txq->pk_link;
		LISTDELETE(pp2, pk_link, pk_rlink);
		TVCLEAR(&pp2->pk_rtv);
		/* first retry interval is twice the current rtt */
		TVXADDY(&pp2->pk_rta, &hp->hd_rtt, &hp->hd_rtt);
		TVCLEAR(&pp2->pk_rto);
		TVCLEAR(&pp2->pk_at);
		pp2->pk_nrt = 0;
		pp2->pk_hostd = hp;
		pp2->pk_seq = hp->hd_txseq;
		hp->hd_txseq = NEXTSEQNUM(hp->hd_txseq);
		LISTPUTBEFORE(hp->hd_opq, pp2, pk_link, pk_rlink);
		hp->hd_nop++;
		LISTPUTAFTER(opq, pp2, pk_tlink, pk_trlink);
	}

	/* a bare ack carries nothing else for us */
	if (!(ff & (FFDAT|FFFIN)))
		goto scrap;

	/*
	* send an ack for the pkt
	*/

	pp2 = pk_new(DDFRAGHDR);	/* XXX could reref a dummy databuf here */
	pp2->pk_dat += DDFRAGHDR;
	pp2->pk_dst = hp->hd_hostpart | TIDPVMD;
	pp2->pk_src = pvmmytid;
	pp2->pk_flag = FFACK;
	TVCLEAR(&pp2->pk_rtv);
	TVCLEAR(&pp2->pk_rta);
	TVCLEAR(&pp2->pk_rto);
	TVCLEAR(&pp2->pk_at);
	pp2->pk_nrt = 0;
	pp2->pk_hostd = hp;
	pp2->pk_seq = 0;
	pp2->pk_ack = sqn;
	LISTPUTAFTER(opq, pp2, pk_tlink, pk_trlink);

	if (!(ff & FFDAT))
		goto scrap;

	/*
	* if we don't have it already, put it in reordering q
	*/

	pp2 = 0;
	if (SEQNUMCOMPARE(sqn, hp->hd_rxseq))
		already = 1;
	else {
		already = 0;
		/* find the first queued pkt whose seq is not below ours */
		for (pp2 = hp->hd_rxq->pk_link; pp2 != hp->hd_rxq; pp2 = pp2->pk_link)
			if (pp2->pk_seq >= sqn) {
				if (pp2->pk_seq == sqn)
					already = 1;
				break;
			}
	}
	if (already) {
		if (debugmask & PDMPACKET) {
			sprintf(pvmtxt, "netinput() pkt resent from %s seq %d\n",
					hp->hd_name, sqn);
			pvmlogerror(pvmtxt);
		}
		goto scrap;
	}

	/* pp2 is the insertion point (the q head if we ran off the end) */
	LISTPUTBEFORE(pp2, pp, pk_link, pk_rlink);

	/*
	* accept pkts from reordering q
	*/

	while (pp = hp->hd_rxq->pk_link,
			pp != hp->hd_rxq && pp->pk_seq == hp->hd_rxseq) {
		hp->hd_rxseq = NEXTSEQNUM(hp->hd_rxseq);
		LISTDELETE(pp, pk_link, pk_rlink);
		netinpkt(hp, pp);
	}
	return 0;

scrap:
	if (pp)
		pk_free(pp);
	return 0;
}


/*	netinpkt()
*
*	Consume pkt from network.  It's either for the pvmd and needs to
*	be reassembled into a message or it's for a local task and needs
*	to be put on the queue to be sent.
*
*	hp is the host descriptor the pkt arrived from, pp the decoded pkt.
*	A pkt addressed to a multicast tid known for that host is
*	replicated, sharing the data buffer, to each local destination.
*	The pkt is freed here unless it is handed on to pkt_to_task()
*	or mpp_output().
*
*	A reassembled message that ends up with no consumer (bad checksum,
*	or sent by a task other than the scheduler) is released with
*	mesg_unref() instead of being leaked.
*
*	Always returns 0.
*/

int
netinpkt(hp, pp)
	struct hostd *hp;
	struct pkt *pp;
{
	struct mca *mcap = 0;
	struct task *tp;
	struct mesg *mp;
	struct frag *fp;
	struct pkt *pp2;
	int src = pp->pk_src;
	int dst = pp->pk_dst;
	int ff = pp->pk_flag;
	char *cp;
	int i;

	if (debugmask & PDMPACKET) {
		sprintf(pvmtxt,
		"netinpkt() pkt from %s src t%x dst t%x f %s len %d\n",
				hp->hd_name, src, dst, pkt_flags(ff), pp->pk_len);
		pvmlogerror(pvmtxt);
	}

	/* throw out packet if it's not for us */

	if (TIDISMCA(dst)) {
		/* look up the multicast address among those known for this host */
		for (mcap = hp->hd_mcas->mc_link; mcap != hp->hd_mcas;
				mcap = mcap->mc_link)
			if (mcap->mc_tid == dst)
				break;
		if (mcap == hp->hd_mcas)
			mcap = 0;
	}

	if ((dst & tidhmask) != myhostpart && !mcap) {
		if (debugmask & (PDMPACKET|PDMAPPL)) {
			sprintf(pvmtxt,
					"netinpkt() pkt from t%x for t%x scrapped (not us)\n",
					src, dst);
			pvmlogerror(pvmtxt);
		}
		goto done;
	}

	if (mcap) {

#if	defined(IMA_PGON) || defined(IMA_I860)
		mpp_mcast(pp, mcap->mc_dsts, mcap->mc_ndst);
#else	/*defined(IMA_PGON) || defined(IMA_I860) */

		for (i = mcap->mc_ndst; i-- > 0; ) {
			dst = mcap->mc_dsts[i];
			if (tp = task_find(dst)) {		/* to local task */
				/* new pkt header sharing the original data buffer */
				pp2 = pk_new(0);
				pp2->pk_src = src;
				pp2->pk_dst = dst;
				pp2->pk_flag = ff;
				pp2->pk_cod = pp->pk_cod;
				pp2->pk_enc = pp->pk_enc;
				pp2->pk_wid = pp->pk_wid;
				pp2->pk_crc = pp->pk_crc;
				pp2->pk_buf = pp->pk_buf;
				pp2->pk_max = pp->pk_max;
				pp2->pk_dat = pp->pk_dat;
				pp2->pk_len = pp->pk_len;
				da_ref(pp->pk_buf);

				pkt_to_task(tp, pp2);

			} else
				if (debugmask & (PDMPACKET|PDMAPPL)) {
					sprintf(pvmtxt,
					"netinpkt() mc pkt from t%x for t%x scrapped (no dst)\n",
							src, dst);
					pvmlogerror(pvmtxt);
				}
		}

#endif	/*defined(IMA_PGON) || defined(IMA_I860) */

		/* last frag of the message: the multicast address is used up */
		if (ff & FFEOM) {
			if (debugmask & PDMMESSAGE) {
				/* hd_name is a string, so it must be printed with %s */
				sprintf(pvmtxt, "netinpkt() freed mca %x from %s\n",
						mcap->mc_tid, hp->hd_name);
				pvmlogerror(pvmtxt);
			}
			mca_free(mcap);
		}
		goto done;
	}

	if ((dst & ~tidhmask) == TIDPVMD) {		/* for pvmd */
		if (ff & FFSOM) {			/* start of message */
			if (hp->hd_rxm) {
				sprintf(pvmtxt, "netinpkt() repeated start pkt from %s\n",
						hp->hd_name);
				pvmlogerror(pvmtxt);
				goto done;
			}
			hp->hd_rxm = mesg_new(0);
			hp->hd_rxm->m_cod = pp->pk_cod;
			hp->hd_rxm->m_enc = pp->pk_enc;
			hp->hd_rxm->m_wid = pp->pk_wid;
			hp->hd_rxm->m_crc = pp->pk_crc;
			hp->hd_rxm->m_dst = dst;
			hp->hd_rxm->m_src = src;

		} else {					/* middle or end of message */
			if (!hp->hd_rxm) {
				sprintf(pvmtxt,
						"netinpkt() spurious pkt (no message) from %s\n",
						hp->hd_name);
				pvmlogerror(pvmtxt);
				goto done;
			}
		}

		/* append the pkt's data to the message as a frag sharing the buffer */
		fp = fr_new(0);
		fp->fr_buf = pp->pk_buf;
		fp->fr_dat = pp->pk_dat;
		fp->fr_max = pp->pk_max;
		fp->fr_len = pp->pk_len;
		da_ref(pp->pk_buf);
		LISTPUTBEFORE(hp->hd_rxm->m_frag, fp, fr_link, fr_rlink);
		hp->hd_rxm->m_len += fp->fr_len;

		if (ff & FFEOM) {		/* end of message */
			mp = hp->hd_rxm;
			hp->hd_rxm = 0;
#ifdef	MCHECKSUM
			if (mp->m_crc != mesg_crc(mp)) {
				sprintf(pvmtxt,
						"netinpkt() message from t%x to t%x bad checksum\n",
						src, dst);
				pvmlogerror(pvmtxt);
				/* already unhooked from hd_rxm, so release it here */
				mesg_unref(mp);
				goto done;
			}
#endif
			mesg_rewind(mp);
			if (TIDISTASK(src)) {
				if (src == pvmschedtid) {
					schentry(mp);

				} else {
					if (debugmask & (PDMMESSAGE|PDMAPPL)) {
						sprintf(pvmtxt,
						"netinpkt() mesg from t%x to t%x code %d scrapped\n",
								src, dst, mp->m_cod);
						pvmlogerror(pvmtxt);
					}
					/* no entry point takes it, so drop our reference */
					mesg_unref(mp);
				}

			} else {
				netentry(hp, mp);
			}
		}

	} else {								/* for a task */
		if (tp = task_find(dst)) {

#if defined(IMA_PGON) || defined(IMA_I860)
			if (TIDISNODE(dst))
				mpp_output(tp, pp);
			else
#endif
				pkt_to_task(tp, pp);
			/* pkt has been handed on; don't free it below */
			pp = 0;

		} else {
			if (debugmask & (PDMPACKET|PDMAPPL)) {
				sprintf(pvmtxt,
						"netinpkt() pkt from t%x for t%x scrapped (no dst)\n",
						src, dst);
				pvmlogerror(pvmtxt);
			}
			goto done;
		}
	}

done:
	if (pp)
		pk_free(pp);
	return 0;
}


/*	loclconn()
*
*	Task has attempted to connect.  Accept the new connection and make
*	a blank context for it.
*
*	Allocates a task context with task_new(0) and accepts on loclsock
*	(a TCP socket if NOUNIXDOM is defined, else a unix-domain socket).
*	If accept fails the context is freed again.  Otherwise the new
*	socket is switched to non-blocking mode and added to the work fd
*	set with wrk_fds_add(sock, 1).
*
*	fcntl() failures are logged but not fatal; the socket is still
*	added to the fd set.
*
*	Always returns 0.
*/

int
loclconn()
{
	struct task *tp;			/* new task context */
	int i;						/* scratch: option value, addr length, fd flags */
#ifndef NOUNIXDOM
	struct sockaddr_un uns;
#endif

	tp = task_new(0);

#ifdef NOUNIXDOM
	tp->t_salen = sizeof(tp->t_sad);

	if ((tp->t_sock = accept(loclsock, (struct sockaddr*)&tp->t_sad,
			&tp->t_salen)) == -1) {
		pvmlogperror("loclconn() accept");
		task_free(tp);
		tp = 0;

	} else {
		if (debugmask & (PDMPACKET|PDMTASK)) {
			sprintf(pvmtxt, "loclconn() accept from %s sock %d\n",
					inadport_decimal(&tp->t_sad), tp->t_sock);
			pvmlogerror(pvmtxt);
		}
#ifndef NOSOCKOPT
		i = 1;
		if (setsockopt(tp->t_sock, IPPROTO_TCP, TCP_NODELAY,
				(char*)&i, sizeof(int)) == -1) {
			pvmlogperror("loclconn() setsockopt");
		}
#endif
	}

#else /*NOUNIXDOM*/
	i = sizeof(uns);
	if ((tp->t_sock = accept(loclsock, (struct sockaddr*)&uns, &i)) == -1) {
		pvmlogperror("loclconn() accept");
		task_free(tp);
		tp = 0;

	} else {
		if (debugmask & (PDMPACKET|PDMTASK))
			pvmlogerror("loclconn() accept\n");
	}

#endif /*NOUNIXDOM*/

	if (tp) {
		if ((i = fcntl(tp->t_sock, F_GETFL, 0)) == -1)
			pvmlogperror("loclconn: fcntl");
		else {
#ifdef	IMA_RS6K
	/* did you ever feel as though your mind had started to erode? */
			i |= O_NONBLOCK;
#else	/*IMA_RS6K*/
#ifdef O_NDELAY
			i |= O_NDELAY;
#else
			i |= FNDELAY;
#endif
#endif	/*IMA_RS6K*/
			/*
			* if this fails the socket stays blocking, so at least
			* leave a trace in the log.
			*/
			if (fcntl(tp->t_sock, F_SETFL, i) == -1)
				pvmlogperror("loclconn: fcntl");
		}
		wrk_fds_add(tp->t_sock, 1);
	}

	return 0;
}


/*	locloutput()
*
*	Output to local task.  Works through the task's txq, writing
*	packets to its socket until the queue is empty or write() can
*	take no more.  Fragment (and, on a start-of-message pkt, message)
*	headers are built in the buffer space just ahead of the data.
*	pk_cpos remembers how far into a packet we've written so that a
*	partial write can be picked up on the next call.
*
*	When the queue is empty the socket is removed from the fd set
*	with wrk_fds_delete(sock, 2).
*
*	Returns 0 if okay, else -1 if unrecoverable error (a hard write
*	error, or the queue has drained and TF_CLOSE is set on the task).
*/

locloutput(tp)
	struct task *tp;
{
	struct pkt *pk;				/* packet at head of queue */
	char *outp;					/* next byte to write */
	int left;					/* bytes still to write for this pkt */
	int wrote;					/* result of write() */

	for (; ; ) {
		/* the queue's master node has no data buffer; that ends the walk */
		pk = tp->t_txq->pk_link;
		if (!pk->pk_buf)
			break;

		if (!pk->pk_cpos || pk->pk_cpos < pk->pk_dat) {
	/*
	* prepend frag [message] headers if we'll be writing them.
	*/
			outp = pk->pk_dat;
			left = pk->pk_len;
			if (pk->pk_flag & FFSOM) {
				outp -= TTMSGHDR;
				left += TTMSGHDR;
				if (outp < pk->pk_buf) {
					pvmlogerror("locloutput() no headroom for message header\n");
					return 0;
				}
				pvmput32(outp, pk->pk_cod);
				pvmput32(outp + 4, pk->pk_enc);
				pvmput32(outp + 8, pk->pk_wid);
				pvmput32(outp + 12, pk->pk_crc);
			}
			outp -= TDFRAGHDR;
			if (outp < pk->pk_buf) {
				pvmlogerror("locloutput() no headroom for packet header\n");
				return 0;
			}
			pvmput32(outp, pk->pk_dst);
			pvmput32(outp + 4, pk->pk_src);
			pvmput32(outp + 8, left);
			pvmput32(outp + 12, 0);			/* to keep purify happy */
			pvmput8(outp + 12, pk->pk_flag & (FFSOM|FFEOM));
			left += TDFRAGHDR;
		}

		if (!pk->pk_cpos) {
			/* first attempt at this packet: start from the header */
			pk->pk_cpos = outp;
			if (debugmask & PDMPACKET) {
				sprintf(pvmtxt,
					"locloutput() src t%x dst t%x f %s len %d\n",
					pk->pk_src, pk->pk_dst, pkt_flags(pk->pk_flag), left);
				pvmlogerror(pvmtxt);
			}

		} else {
			/* resume where the last partial write stopped */
			outp = pk->pk_cpos;
			left = pk->pk_len + (pk->pk_dat - outp);
		}

	/*
	* send as much as possible; skip to next packet when all sent
	*/

#if defined(IMA_RS6K) || defined(IMA_SP2MPI)
		wrote = write(tp->t_sock, outp, min(left, 4096));
#else
		wrote = write(tp->t_sock, outp, left);
#endif

#ifdef	STATISTICS
		if (wrote == -1)
			stats.wrneg++;
		else if (!wrote)
			stats.wrzer++;
		else if (wrote == left)
			stats.wrok++;
		else
			stats.wrshr++;
#endif

		if (wrote == -1) {
			/* these just mean try again later */
			if (errno == EWOULDBLOCK
			|| errno == EINTR
			|| errno == EAGAIN
			|| errno == ENOBUFS)
				break;

			pvmlogperror("locloutput() write");
			sprintf(pvmtxt, "locloutput() marking t%x dead\n",
					tp->t_tid);
			pvmlogerror(pvmtxt);
			return -1;
		}

		if (wrote <= 0)
			break;

		if (debugmask & PDMPACKET) {
			sprintf(pvmtxt,
					"locloutput() src t%x dst t%x wrote %d\n",
					pk->pk_src, pk->pk_dst, wrote);
			pvmlogerror(pvmtxt);
		}

		if (wrote < left) {
			/* partial write; remember our place */
			pk->pk_cpos += wrote;

		} else {
			/* whole packet gone; drop it from the queue */
#if defined(IMA_CM5) || defined(IMA_SP2MPI)
			int dst = pk->pk_dst;
#endif
			LISTDELETE(pk, pk_link, pk_rlink);
			pk_free(pk);
#if defined(IMA_CM5) || defined(IMA_SP2MPI)
			if (TIDISNODE(dst)) {
				struct task *tp2;

				/* Expensive! But what else can we do? */
				if ((tp2 = task_find(dst)) && (tp2->t_flag & TF_CLOSE)) {
					mpp_free(tp2);
					/* XXX task_cleanup(tp2); */
					task_free(tp2);
				}
			}
#endif /*defined(IMA_CM5) || defined(IMA_SP2MPI)*/
		}
	}

	if (tp->t_txq->pk_link == tp->t_txq) {
		wrk_fds_delete(tp->t_sock, 2);

	/* flush context if TF_CLOSE set */

		if (tp->t_flag & TF_CLOSE)
			return -1;
	}

	return 0;
}


/*	loclinput()
*
*	Input from a task.
*	Accept a packet and pass pkt to loclinpkt().
*
*	The pkt being assembled is kept in tp->t_rxp between calls.  The
*	fragment header is read first; once it is complete the length
*	field tells how much body to read, and a bigger buffer is made
*	if the frag won't fit in the current one.  Only a task with
*	TF_CONN set may ask for a bigger buffer.
*
*	The length field comes straight from the task, so it is checked
*	before use: a negative value, or one so large that adding the
*	header sizes would overflow, is treated as a fatal error for the
*	task.
*
*	Returns 0 else -1 if error (work() should clean up the task context).
*	EOF on the socket also returns -1.
*/

int
loclinput(tp)
	struct task *tp;
{
	struct pkt *pp = 0;
	struct pkt *pp2;
	int n, m;
	int fraglen;				/* body length claimed by frag header */

again:
	/*
	* if no current packet, start a new one
	*/

	if (!tp->t_rxp) {
		tp->t_rxp = pk_new(ourudpmtu);
/*
		tp->t_rxp = pk_new(TDFRAGHDR + 2);
*/
		/* keep enough headroom for the larger of the two frag headers */
		if (DDFRAGHDR > TDFRAGHDR)
			tp->t_rxp->pk_dat += DDFRAGHDR - TDFRAGHDR;
	}
	pp = tp->t_rxp;

	/*
	* read the fragment header and body separately so we can
	* make a bigger buffer if needed
	*/

	/* n = bytes still missing: rest of header, or rest of header+body */
	n = (pp->pk_len < TDFRAGHDR) ? 0 : pvmget32(pp->pk_dat + 8);
	n += TDFRAGHDR - pp->pk_len;
	if (debugmask & PDMPACKET) {
		sprintf(pvmtxt, "loclinput() t%x fr_len=%d fr_dat=+%d n=%d\n",
				tp->t_tid, pp->pk_len, pp->pk_dat - pp->pk_buf, n);
		pvmlogerror(pvmtxt);
	}
	n = read(tp->t_sock, pp->pk_dat + pp->pk_len, n);
	if (debugmask & PDMPACKET) {
		if (n >= 0) {
			sprintf(pvmtxt, "loclinput() read=%d\n", n);
			pvmlogerror(pvmtxt);
		} else
			pvmlogperror("loclinput() read");
	}

#ifdef	STATISTICS
		switch (n) {
		case -1:
			stats.rdneg++;
			break;
		case 0:
			stats.rdzer++;
			break;
		default:
			stats.rdok++;
			break;
		}
#endif
	if (n == -1) {
		/* same set of "try again later" errors as locloutput() */
		if (errno != EWOULDBLOCK && errno != EINTR && errno != EAGAIN) {
			pvmlogperror("loclinput() read");
			sprintf(pvmtxt, "loclinput() marking t%x dead\n",
					tp->t_tid);
			pvmlogerror(pvmtxt);
			return -1;
		}
		return 0;
	}
	if (!n) {
		if (debugmask & (PDMPACKET|PDMMESSAGE|PDMTASK)) {
			sprintf(pvmtxt, "loclinput() read EOF from t%x sock %d\n",
					tp->t_tid, tp->t_sock);
			pvmlogerror(pvmtxt);
		}
		return -1;
	}

	/* still don't have a whole header; wait for more */
	if ((pp->pk_len += n) < TDFRAGHDR)
		return 0;

	/*
	* header is complete.  validate the length before it is used to
	* size a read or a buffer.  the field is 32 bits, so the limit
	* leaves room to add either fragment header without overflow.
	*/

	fraglen = pvmget32(pp->pk_dat + 8);
	if (fraglen < 0 || fraglen > 0x7fffffff - TDFRAGHDR - DDFRAGHDR) {
		sprintf(pvmtxt, "loclinput() t%x sends bad frag length %d\n",
				tp->t_tid, fraglen);
		pvmlogerror(pvmtxt);
		return -1;
	}

	/*
	* if we have a complete frag, accept it
	*/

	m = TDFRAGHDR + fraglen;
	if (pp->pk_len == m) {
		tp->t_rxp = 0;
		pp->pk_dst = pvmget32(pp->pk_dat);
#if defined(IMA_PGON) || defined(IMA_I860) || defined(IMA_CM5) || defined(IMA_SP2MPI)
		pp->pk_src = pvmget32(pp->pk_dat + 4);
#else
		/* ignore the src field in the header; use the tid we know */
		pp->pk_src = tp->t_tid;
#endif
		pp->pk_flag = pvmget8(pp->pk_dat + 12);
		pp->pk_len -= TDFRAGHDR;
		pp->pk_dat += TDFRAGHDR;
		if (pp->pk_flag & FFSOM) {
			if (pp->pk_len < TTMSGHDR) {
				sprintf(pvmtxt,
						"loclinput() SOM pkt src t%x dst t%x too short\n",
						pp->pk_src, pp->pk_dst);
				pvmlogerror(pvmtxt);
				pk_free(pp);
				return 0;
			}
			pp->pk_cod = pvmget32(pp->pk_dat);
			pp->pk_enc = pvmget32(pp->pk_dat + 4);
			pp->pk_wid = pvmget32(pp->pk_dat + 8);
			pp->pk_crc = pvmget32(pp->pk_dat + 12);
			pp->pk_len -= TTMSGHDR;
			pp->pk_dat += TTMSGHDR;
		}
		if (loclinpkt(tp, pp))
			return -1;
		return 0;
	}

	/* realloc buffer if frag won't fit */

	if (pp->pk_len == TDFRAGHDR) {
		if (m > pp->pk_max - (pp->pk_dat - pp->pk_buf)) {
			if (!(tp->t_flag & TF_CONN)) {
				sprintf(pvmtxt,
					"loclinput() unconnected task sends frag length %d (ha)\n",
					m);
				pvmlogerror(pvmtxt);
				return -1;
			}
			if (DDFRAGHDR > TDFRAGHDR) {
				pp2 = pk_new(m + DDFRAGHDR - TDFRAGHDR);
				pp2->pk_dat += DDFRAGHDR - TDFRAGHDR;
			} else
				pp2 = pk_new(m);
			/* carry the header we've already read over to the new buffer */
			BCOPY(pp->pk_dat, pp2->pk_dat, TDFRAGHDR);
			pp2->pk_len = pp->pk_len;
			pk_free(pp);
			pp = tp->t_rxp = pp2;
			if (debugmask & PDMPACKET) {
				sprintf(pvmtxt, "loclinput() realloc frag max=%d\n", m);
				pvmlogerror(pvmtxt);
			}
		}
		/* just finished the header; go straight on to read the body */
		goto again;
	}

	return 0;
}


/*	loclinpkt()
*
*	Consume pkt from task.
*	If it's for the pvmd it needs to be reassembled into a message.
*	If for a local or foreign task it needs to be put on a queue to be sent.
*	If for a remote pvmd, reassemble as for local then fwd whole message.
*	If for the task's current multicast address, a copy (sharing the
*	data buffer) is made for each destination host.
*
*	The pkt is freed here unless it is handed on to pkt_to_task(),
*	mpp_output() or pkt_to_host().  A reassembled message that is
*	dropped (bad checksum, or remote-bound from a task with no tid)
*	is released with mesg_unref() instead of being leaked.
*
*	Returns 0 else -1 if error (work() should cleanup the
*	task context).
*/

int
loclinpkt(tp, pp)
	struct task *tp;
	struct pkt *pp;
{
	int dst;			/* pkt dst */
	int ff;				/* pkt flags */
	struct pkt *pp2;
	struct frag *fp;
	struct mesg *mp;
	struct hostd *hp;
	struct task *tp2;
#if defined(IMA_CM5) || defined(IMA_SP2MPI)
	struct task *socktp = tp;	/* owner of the socket */
#endif

	dst = pp->pk_dst;
	ff = pp->pk_flag;
	if (debugmask & PDMPACKET) {
		sprintf(pvmtxt,
				"loclinpkt() src t%x dst t%x f %s len %d\n",
				pp->pk_src, dst, pkt_flags(ff), pp->pk_len);
		pvmlogerror(pvmtxt);
	}

#ifdef IMA_SP2MPI
	if (pp->pk_src > 0 && !tp->t_tid && (tp2 = task_findpid(pp->pk_src))) {
		/* connect request from pvmhost */
		mpp_conn(tp, tp2);
		pk_free(pp);
		return -1;
	}
#endif
#if defined(IMA_PGON) || defined(IMA_I860) || defined(IMA_CM5) || defined(IMA_SP2MPI)
	/* from here on tp is the sending node's context, not the socket owner's */
	if (TIDISNODE(pp->pk_src))		/* from a node */
		if (!(tp = task_find(pp->pk_src))) {
			sprintf(pvmtxt, "loclinpkt() from unknown task t%x\n", pp->pk_src);
			pvmlogerror(pvmtxt);
			goto done;
		}
#endif	/*defined(IMA_PGON) || defined(IMA_I860) || defined(IMA_CM5) || defined(IMA_SP2MPI)*/

	/*
	* if to multicast addr, replicate pkt in each q
	*/

	if (TIDISMCA(dst) && tp->t_mca && tp->t_mca->mc_tid == dst) {

		struct mca *mcap = tp->t_mca;
		int i;

		for (i = mcap->mc_ndst; i-- > 0; ) {
			dst = mcap->mc_dsts[i];
			if (hp = tidtohost(hosts, dst)) {
				/* new pkt header sharing the original data buffer */
				pp2 = pk_new(0);
				pp2->pk_src = pp->pk_src;
				pp2->pk_dst = mcap->mc_tid;
				pp2->pk_flag = ff;
				pp2->pk_cod = pp->pk_cod;
				pp2->pk_enc = pp->pk_enc;
				pp2->pk_wid = pp->pk_wid;
				pp2->pk_crc = pp->pk_crc;
				pp2->pk_buf = pp->pk_buf;
				pp2->pk_max = pp->pk_max;
				pp2->pk_dat = pp->pk_dat;
				pp2->pk_len = pp->pk_len;
				da_ref(pp->pk_buf);

				/* our own host: deliver as if it had come off the net */
				if (hp->hd_hostpart == myhostpart) {
					netinpkt(hp, pp2);

				} else {
					pkt_to_host(hp, pp2);
				}

			} else
				if (debugmask & (PDMPACKET|PDMAPPL)) {
					sprintf(pvmtxt,
					"loclinpkt() pkt src t%x dst t%x scrapped (no such host)\n",
							pp->pk_src, dst);
					pvmlogerror(pvmtxt);
				}
		}

	/* free mca on last pkt */

		if (ff & FFEOM) {
			if (debugmask & PDMMESSAGE) {
				sprintf(pvmtxt, "loclinpkt() freed mca %x for t%x\n",
						mcap->mc_tid, tp->t_tid);
				pvmlogerror(pvmtxt);
			}
			mca_free(mcap);
			tp->t_mca = 0;
		}
		goto done;
	}

	/*
	* if to a pvmd, always reassemble (forward if not for us)
	*/

	if ((dst & ~tidhmask) == TIDPVMD) {
		if (ff & FFSOM) {			/* start of message */
			if (tp->t_rxm) {
				sprintf(pvmtxt, "loclinpkt() repeated start pkt t%x\n",
						tp->t_tid);
				pvmlogerror(pvmtxt);
				goto done;
			}
			tp->t_rxm = mesg_new(0);
			tp->t_rxm->m_cod = pp->pk_cod;
			tp->t_rxm->m_enc = pp->pk_enc;
			tp->t_rxm->m_wid = pp->pk_wid;
			tp->t_rxm->m_crc = pp->pk_crc;
			tp->t_rxm->m_dst = dst;
			tp->t_rxm->m_src = tp->t_tid;

		} else {					/* middle or end of message */
			if (!tp->t_rxm) {
				sprintf(pvmtxt,
					"loclinpkt() pkt with no message src t%x\n",
					tp->t_tid);
				pvmlogerror(pvmtxt);
				goto done;
			}
		}

		/* append the pkt's data to the message as a frag sharing the buffer */
		fp = fr_new(0);
		fp->fr_buf = pp->pk_buf;
		fp->fr_dat = pp->pk_dat;
		fp->fr_max = pp->pk_max;
		fp->fr_len = pp->pk_len;
		da_ref(pp->pk_buf);

		LISTPUTBEFORE(tp->t_rxm->m_frag, fp, fr_link, fr_rlink);
		tp->t_rxm->m_len += fp->fr_len;

		if (ff & FFEOM) {		/* end of message */
			mp = tp->t_rxm;
			tp->t_rxm = 0;
#ifdef	MCHECKSUM
			if (mp->m_crc != mesg_crc(mp)) {
				sprintf(pvmtxt,
						"loclinpkt() message src t%x dst t%x bad checksum\n",
						mp->m_src, dst);
				pvmlogerror(pvmtxt);
				/* already unhooked from t_rxm, so release it here */
				mesg_unref(mp);
				goto done;
			}
#endif
			if (!(dst & tidhmask) || (dst & tidhmask) == myhostpart) {	/* local */
				mesg_rewind(mp);
				/* scheduler-class codes are honored only from the registered scheduler, hoster or tasker */
				if (mp->m_cod >= (int)SM_FIRST && mp->m_cod <= (int)SM_LAST
				&& (mp->m_src == pvmschedtid || mp->m_src == hostertid || mp->m_src == taskertid))
				{
					schentry(mp);

				} else {
					loclentry(tp, mp);
				}

			} else {		/* remote */
				if (!tp->t_tid) {
					sprintf(pvmtxt, "loclinpkt() pkt src null dst t%x\n", dst);
					pvmlogerror(pvmtxt);
					/* not forwarded, so drop our reference */
					mesg_unref(mp);
					goto done;
				}
				sendmessage(mp);
			}
	/*
	* if sock is -1, tm_conn2() wants us to throw out this context
	* because it's been merged into another.
	*/
#if defined(IMA_CM5) || defined(IMA_SP2MPI)
			/* node procs have no socket; they use pvmhost's */
			if (socktp->t_sock == -1)
#else
			if (tp->t_sock == -1)
#endif
			{
				pk_free(pp);
				return -1;
			}
		}
		goto done;
	}

	/*
	* if to a task, put in local or remote send queue
	*/

	if (TIDISTASK(dst)) {
		/* a task that has no tid yet may only talk to the pvmd */
		if (!tp->t_tid) {
			sprintf(pvmtxt, "loclinpkt() pkt src null dst t%x\n", dst);
			pvmlogerror(pvmtxt);
			goto done;
		}
		if (!(dst & tidhmask) || (dst & tidhmask) == myhostpart) {	/* local */
			if (tp2 = task_find(dst)) {

#if defined(IMA_PGON) || defined(IMA_I860)
				if (TIDISNODE(dst))
					mpp_output(tp2, pp);
				else
#endif
					pkt_to_task(tp2, pp);
/*
				LISTPUTBEFORE(tp2->t_txq, pp, pk_link, pk_rlink);
*/
				/* pkt has been handed on; don't free it below */
				pp = 0;

			} else
				if (debugmask & (PDMPACKET|PDMAPPL)) {
					sprintf(pvmtxt,
					"loclinpkt() pkt src t%x dst t%x scrapped (no such task)\n",
							pp->pk_src, dst);
					pvmlogerror(pvmtxt);
				}

		} else {		/* remote host */

			if (hp = tidtohost(hosts, dst)) {
				pkt_to_host(hp, pp);
				pp = 0;

			} else {
				if (debugmask & (PDMPACKET|PDMAPPL)) {
					sprintf(pvmtxt,
					"loclinpkt() pkt src t%x dst t%x scrapped (no such host)\n",
							pp->pk_src, dst);
					pvmlogerror(pvmtxt);
				}
				goto done;
			}
		}
	}

done:
	if (pp)
		pk_free(pp);

	return 0;
}


/*	loclstout()
*
*	Read stdout/err pipe from a task.
*
*	Whatever was read is packed into a message as (task tid, byte
*	count, bytes).  If the task has an output tid set the message
*	goes there with the task's output code; otherwise it goes, with
*	code DM_TASKOUT, to the pvmd of the host at index ht_cons in the
*	host table.
*
*	On EOF or a hard read error the pipe is closed and removed from
*	the fd set.  If an output tid is set it is sent a final message
*	of (task tid, 0) and then cleared.  EINTR and EWOULDBLOCK are
*	ignored.
*
*	Always returns 0.
*/

loclstout(tp)
	struct task *tp;
{
	static char obuf[4000];
	struct mesg *msg;
	int got;

	got = read(tp->t_out, obuf, sizeof(obuf) - 1);

	/* got some output; ship it */

	if (got >= 1) {
		msg = mesg_new(0);
		pkint(msg, tp->t_tid);
		pkint(msg, got);
		bytepk(msg, obuf, got, 1, 1);
		if (!tp->t_outtid) {
			msg->m_cod = DM_TASKOUT;
			msg->m_dst = hosts->ht_hosts[hosts->ht_cons]->hd_hostpart | TIDPVMD;

		} else {
			msg->m_cod = tp->t_outcod;
			msg->m_dst = tp->t_outtid;
		}
		sendmessage(msg);
		return 0;
	}

	/* transient error; try again next time */

	if (got != 0 && (errno == EINTR || errno == EWOULDBLOCK))
		return 0;

	/* EOF or real error; shut the pipe down */

	wrk_fds_delete(tp->t_out, 1);
	(void)close(tp->t_out);
	tp->t_out = -1;

	if (tp->t_outtid) {
		/* zero-length record marks end of output for this task */
		msg = mesg_new(0);
		msg->m_cod = tp->t_outcod;
		msg->m_dst = tp->t_outtid;
		pkint(msg, tp->t_tid);
		pkint(msg, 0);
		sendmessage(msg);
		tp->t_outtid = 0;
	}
	return 0;
}


/*	mesg_to_task()
*
*	Append a message to the send queue for a task.
*
*	One pkt is made per frag of the message, sharing the frag's data
*	buffer.  The first pkt carries FFSOM and the message header
*	fields, the last carries FFEOM.  All pkts have source TIDPVMD.
*	A message with MM_PRIO set is put at the front of the task's txq
*	instead of going through pkt_to_task().
*
*	If the task has a socket it is added to the fd set with
*	wrk_fds_add(sock, 2).
*
*	N.B. Message must contain at least one frag or this will honk.
*
*	Always returns 0.
*/

int
mesg_to_task(tp, mp)
	struct task *tp;
	struct mesg *mp;
{
	struct frag *cur = mp->m_frag->fr_link;		/* frag being packetized */
	struct pkt *pk;
	int dst = mp->m_dst;
	int first = 1;			/* true until the first pkt is built */
	int flags;				/* frag flags for the current pkt */

	if (debugmask & PDMMESSAGE) {
		sprintf(pvmtxt, "mesg_to_task() dst t%x code %s len %d\n",
				dst, pvmnametag(mp->m_cod, (int *)0), mp->m_len);
		pvmlogerror(pvmtxt);
	}

	/* make sure the task's sock is in wrk_wfds */

	if (tp->t_sock >= 0)
		wrk_fds_add(tp->t_sock, 2);

	do {
		pk = pk_new(0);
		flags = 0;

		/* only the first pkt carries the message header fields */
		if (first) {
			first = 0;
			flags = FFSOM;
			pk->pk_cod = mp->m_cod;
			pk->pk_enc = mp->m_enc;
			pk->pk_wid = mp->m_wid;
#ifdef	MCHECKSUM
			pk->pk_crc = mesg_crc(mp);
#else
			pk->pk_crc = 0;
#endif
		}

		/* share the frag's data buffer rather than copying it */
		pk->pk_buf = cur->fr_buf;
		pk->pk_dat = cur->fr_dat;
		pk->pk_max = cur->fr_max;
		pk->pk_len = cur->fr_len;
		da_ref(pk->pk_buf);

		if (cur->fr_link == mp->m_frag)
			flags |= FFEOM;

		pk->pk_src = TIDPVMD;
		pk->pk_dst = dst;
		pk->pk_flag = flags;

#if defined(IMA_PGON) || defined(IMA_I860)
		if (TIDISNODE(dst)) {
			mpp_output(tp, pk);
			continue;
		}
#endif
#ifdef SHMEM
		if (tp->t_sock < 0) {
			mpp_output(tp, pk);
			continue;
		}
#endif
		if (!(mp->m_flag & MM_PRIO)) {
			pkt_to_task(tp, pk);
/*
			LISTPUTBEFORE(tp->t_txq, pk, pk_link, pk_rlink);
*/
		} else {
			/* priority message goes to the head of the queue */
			LISTPUTAFTER(tp->t_txq, pk, pk_link, pk_rlink);
		}
	} while ((cur = cur->fr_link) != mp->m_frag);

	return 0;
}


/*	sendmessage()
*
*	Send a message.  If it's for a local task or remote host, cut
*	apart the fragments and queue to be sent.  If it's for the local
*	pvmd, just call netentry() with the whole message.
*
*	The caller's reference to the message is always given up here
*	(mesg_unref() on the way out), whether or not it could be routed.
*	An empty message is given one zero-length frag first so there is
*	always something to packetize.
*
*	While runstate is PVMDHTUPD the destination host is looked up in
*	newhosts first, then in hosts.
*
*	N.B. MM_PRIO only works for single-frag messages.
*	A MM_PRIO message addressed to a remote host is scrapped; the
*	pkts built for it are freed.
*
*	Always returns 0.
*/

int
sendmessage(mp)
	struct mesg *mp;
{
	struct hostd *hp = 0;
	struct task *tp;
	struct frag *fp;
	struct pkt *pp;
	int ff = FFSOM;				/* flags for next pkt; FFSOM on first only */
	int dst = mp->m_dst;

	if (!dst) {
		pvmlogerror("sendmessage() what? to t0\n");
	}

	if (debugmask & PDMMESSAGE) {
		sprintf(pvmtxt, "sendmessage() dst t%x code %s len %d\n",
				dst, pvmnametag(mp->m_cod, (int *)0), mp->m_len);
		pvmlogerror(pvmtxt);
	}

	/*
	*	add a frag to empty message to simplify handling
	*/

	if ((fp = mp->m_frag->fr_link) == mp->m_frag) {
		fp = fr_new(MAXHDR);
		/* all of the buffer is headroom; the frag has no data */
		fp->fr_dat += MAXHDR;
		LISTPUTBEFORE(mp->m_frag, fp, fr_link, fr_rlink);
	}

	/*
	*	route message
	*/

	if (!(dst & tidhmask) || (dst & tidhmask) == myhostpart) {	/* to local */

		if (TIDISTASK(dst)) {				/* to local task */

			if (tp = task_find(dst)) {
				mesg_to_task(tp, mp);

			} else
				if (debugmask & (PDMMESSAGE|PDMAPPL)) {
					sprintf(pvmtxt,
							"sendmessage() scrapped, no such task t%x\n",
							dst);
					pvmlogerror(pvmtxt);
				}

		} else {				/* to myself */
			/* extra ref for netentry(); ours is dropped at bail */
			mp->m_ref++;
			mesg_rewind(mp);
			netentry(hosts->ht_hosts[hosts->ht_local], mp);
		}

	} else {					/* to remote */

	/* lookup host */

		if (runstate == PVMDHTUPD)
			hp = tidtohost(newhosts, dst);
		if (!hp && !(hp = tidtohost(hosts, dst))) {
			if (debugmask & (PDMMESSAGE|PDMAPPL)) {
				sprintf(pvmtxt, "sendmessage() scrapped, no such host t%x\n",
						dst);
				pvmlogerror(pvmtxt);
			}
			goto bail;
		}

	/* packetize frags */

		do {
			pp = pk_new(0);
			if (ff & FFSOM) {
				pp->pk_cod = mp->m_cod;
				pp->pk_enc = mp->m_enc;
				pp->pk_wid = mp->m_wid;
#ifdef	MCHECKSUM
				pp->pk_crc = mesg_crc(mp);
#else
				pp->pk_crc = 0;
#endif
			}
			/* share the frag's data buffer rather than copying it */
			pp->pk_buf = fp->fr_buf;
			pp->pk_dat = fp->fr_dat;
			pp->pk_max = fp->fr_max;
			pp->pk_len = fp->fr_len;
			da_ref(pp->pk_buf);
			if (fp->fr_link == mp->m_frag)
				ff |= FFEOM;
			pp->pk_src = mp->m_src;
			pp->pk_dst = dst;
			pp->pk_flag = ff;
			ff = 0;
			if (mp->m_flag & MM_PRIO) {
				if (debugmask & (PDMMESSAGE|PDMAPPL))
					pvmlogerror("sendmessage() PRIO message to host? (scrapped)\n");
				/* nobody takes the pkt, so free it and its buffer ref */
				pk_free(pp);

			} else {
				pkt_to_host(hp, pp);
			}
		} while ((fp = fp->fr_link) != mp->m_frag);
	}

bail:
	mesg_unref(mp);
	return 0;
}


/*	forkexec()
*
*	Search directories in epaths for given file.
*	Clean up any files we opened, fork and exec the named process.
*	Leave std{out,err} open so the process can whine if it needs to.
*
*	Returns 0 if ok (and fills in tpp), else returns PvmNoFile or
*	PvmOutOfRes
*
*	N.B. must be able to use argv[-1].
*/

int
forkexec(flags, name, argv, nenv, env, tpp)
	int flags;				/* exec options */
	char *name;				/* filename */
	char **argv;			/* arg list (argv[-1] must be there) */
	int nenv;				/* num of envars */
	char **env;				/* envars */
	struct task **tpp;		/* return task context */
{
	int tid;				/* task tid */
	int pid;				/* task pid */
	int pfd[2];				/* pipe back from task */
	struct task *tp;		/* new task context */
	char path[MAXPATHLEN];
	struct stat sb;
	char **ep, **eplist;
	int i;
	struct mesg *mp;		/* message to tasker */
	struct waitc *wp;
	int ac;
	int realrunstate;
	char buf[32];

	static char *nullep[] = { "", 0 };
	static int nextfakepid = 10000000;		/* XXX fix this */

	if ((tid = tid_new()) < 0) {
		pvmlogerror("forkexec() out of tids?\n");
		return PvmOutOfRes;
	}
	tp = task_new(tid);

	/* search for file */

	eplist = CINDEX(name, '/') ? nullep : epaths;

	for (ep = eplist; *ep; ep++) {
		(void)strcpy(path, *ep);
		if (path[0])
			(void)strcat(path, "/");
		(void)strncat(path, name, sizeof(path) - strlen(path) - 1);

		if (stat(path, &sb) == -1
				|| ((sb.st_mode & S_IFMT) != S_IFREG)
				|| !(sb.st_mode & S_IEXEC)) {
			if (debugmask & PDMTASK) {
				sprintf(pvmtxt, "forkexec() stat failed <%s>\n", path);
				pvmlogerror(pvmtxt);
			}
			continue;
		}

		if (taskertid) {
			mp = mesg_new(0);
			mp->m_cod = SM_STTASK;
			mp->m_dst = taskertid;
			pkint(mp, tid);
			pkint(mp, flags);
			pkstr(mp, path);
			for (ac = 1; argv[ac]; ac++) ;
			pkint(mp, ac);
			pkstr(mp, path);
			for (i = 1; i < ac; i++)
				pkstr(mp, argv[i]);
			pkint(mp, nenv + 1);
			sprintf(buf, "PVMEPID=%d", nextfakepid);
			pkstr(mp, buf);
			task_setpid(tp, nextfakepid);
			if (++nextfakepid > 20000000)
				nextfakepid = 10000000;
			for (i = 0; i < nenv; i++)
				pkstr(mp, env[i]);
			wp = wait_new(WT_TASKSTART);
			wp->wa_tid = tid;
			wp->wa_on = taskertid;
			mp->m_wid = wp->wa_wid;
			sendmessage(mp);
			if (debugmask & PDMTASK) {
				sprintf(pvmtxt, "forkexec() sent tasker t%x pid %d\n",
						tp->t_tid, tp->t_pid);
				pvmlogerror(pvmtxt);
			}

		} else {
#ifdef	IMA_TITN
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, pfd) == -1) {
				pvmlogperror("forkexec() socketpair");
				task_free(tp);
				return PvmOutOfRes;
			}
#else
			if (pipe(pfd) == -1) {
				pvmlogperror("forkexec() pipe");
				task_free(tp);
				return PvmOutOfRes;
			}
#endif

	/*
	* switch runstate to is-task before forking to avoid race.
	* if we're killed as a task, we don't want to clean up pvmd stuff.
	*/
			realrunstate = runstate;
			runstate = PVMDISTASK;

#if defined(IMA_CSPP) && defined(BALANCED_SPAWN)
			pid = cnx_sc_fork(CNX_INHERIT_SC, (int) __get_node_id());
#else
			pid = fork();
#endif
			if (pid)
				runstate = realrunstate;

			if (!pid) {

	/* close any random fds */

				dup2(pfd[1], 1);
				dup2(1, 2);
				for (i = getdtablesize(); --i > 2; )
					(void)close(i);
	/*
	* set envars
	*/
				while (nenv-- > 0) {
					pvmputenv(env[nenv]);
/*
					sprintf(pvmtxt, "forkexec() putenv(%s)\n", env[nenv]);
					pvmlogerror(pvmtxt);
*/
				}
	/*
	* put expected pid in environment for libpvm in case
	* the process we exec forks before connecting back to the pvmd
	*/
				sprintf(buf, "PVMEPID=%d", getpid());
				pvmputenv(buf);
				argv[0] = path;
				if (flags & PvmTaskDebug) {
					char *p;

					argv--;
					if (p = getenv("PVM_DEBUGGER"))
						argv[0] = p;
					else
						argv[0] = debugger;
					execv(argv[0], argv);

				} else {
					execv(path, argv);
				}
				exit(1);
			}
			if (pid == -1) {
				pvmlogperror("forkexec() fork");
				(void)close(pfd[0]);
				(void)close(pfd[1]);
				task_free(tp);
				return PvmOutOfRes;
			}
			(void)close(pfd[1]);

			task_setpid(tp, pid);
			tp->t_out = pfd[0];
			tp->t_flag |= TF_FORKD;

			wrk_fds_add(tp->t_out, 1);
			if (debugmask & PDMTASK) {
				sprintf(pvmtxt, "forkexec() new task t%x pid %d pfd=%d\n",
						tp->t_tid, tp->t_pid, tp->t_out);
				pvmlogerror(pvmtxt);
			}
		}

		tp->t_a_out = STRALLOC(name);
		*tpp = tp;
		return 0;
	}
	if (debugmask & PDMTASK) {
		sprintf(pvmtxt, "forkexec() didn't find <%s>\n", name);
		pvmlogerror(pvmtxt);
	}
	task_free(tp);
	return PvmNoFile;
}


/*	beprime()
*
*	Pvmd[master] becomes pvmd'[master].
*
*	Switches runstate to PVMDPRIME, replaces the host table with a
*	fresh one holding only the former local host and host 0, makes
*	ppnetsock the real netsock, closes loclsock and every other
*	descriptor above 2 except netsock and log_fd, and resets the
*	task table, fd work sets, output packet queue and dead-task fifo.
*
*	Returns 0.
*/

beprime()
{
	struct htab *newtab;	/* host table for pvmd' */
	int fd;

	runstate = PVMDPRIME;

	myunixpid = getpid();
	if (myunixpid == -1) {
		pvmlogerror("beprime() can't getpid()\n");
		pvmbailout(0);
	}

	pvmmytid = TIDPVMD;
	myhostpart = 0;

	/* new table: old local host becomes master, we sit at index 0 */

	newtab = ht_new(hosts->ht_local);
	newtab->ht_local = 0;
	newtab->ht_master = hosts->ht_local;
	ht_insert(newtab, hosts->ht_hosts[hosts->ht_local]);
	ht_insert(newtab, hosts->ht_hosts[0]);

	/* master entry gets host 0's sequence numbers, tx and rx swapped */

	newtab->ht_hosts[newtab->ht_master]->hd_txseq
			= newtab->ht_hosts[0]->hd_rxseq;
	newtab->ht_hosts[newtab->ht_master]->hd_rxseq
			= newtab->ht_hosts[0]->hd_txseq;

	oldhosts = hosts;
	hosts = newtab;

	/* drop the local socket, promote ppnetsock to netsock */

#ifndef NOUNIXDOM
	loclspath = 0;
#endif
	(void)close(loclsock);
	loclsnam = 0;
	loclsock = -1;
	(void)close(netsock);
	netsock = ppnetsock;
	ppnetsock = -1;

	locltasks = 0;
	task_init();

	/* close everything but netsock, log_fd and 0, 1, 2 */

	fd = getdtablesize();
	while (--fd > 2) {
		if (fd == netsock || fd == log_fd)
			continue;
		(void)close(fd);
	}

	wrk_fds_init();
	wrk_fds_add(netsock, 1);

	/* start over with an empty output packet queue */

	opq = pk_new(0);
	opq->pk_tlink = opq->pk_trlink = opq;

	/* reset dead fifo so pvmd' doesn't clean up after old SIGCHLDs */

	rdead = 0;
	wdead = 0;

	return 0;
}


/*	pkt_to_host()
*
*	Queue a data pkt on a host's send queue (hd_txq).  The pkt is
*	consumed: it is either queued itself or replaced by smaller pkts
*	and freed.
*
*	A pkt whose data doesn't fit in the smaller of the host's mtu and
*	ours (less DDFRAGHDR) is cut into several pkts that all reference
*	the original data buffer.
*
*	The FFSOM pkt is kept TTMSGHDR bytes shorter so there is room to
*	prepend the message header later.
*
*	Afterwards, while the host has fewer than nopax pkts outstanding,
*	pkts are moved from hd_txq to the host's hd_opq and the global opq.
*
*	Returns 0.
*/

int
pkt_to_host(hp, pp)
	struct hostd *hp;
	struct pkt *pp;
{
	int maxl;				/* most data one pkt to this host may hold */
	int llim;				/* limit for the pkt being queued or cut next */
	struct pkt *qp;			/* pkt being moved from txq to opq */

	maxl = (hp->hd_mtu < ourudpmtu ? hp->hd_mtu : ourudpmtu) - DDFRAGHDR;
	llim = maxl;
	if (pp->pk_flag & FFSOM)
		llim = maxl - TTMSGHDR;

	pp->pk_flag = (pp->pk_flag & (FFSOM|FFEOM)) | FFDAT;
	if (debugmask & PDMPACKET) {
		sprintf(pvmtxt, "pkt_to_host() pkt src t%x dst t%x f %s len %d\n",
				pp->pk_src, pp->pk_dst, pkt_flags(pp->pk_flag), pp->pk_len);
		pvmlogerror(pvmtxt);
	}

	if (pp->pk_len > llim) {
		struct pkt *piece;				/* pkt being cut off */
		char *cp = pp->pk_dat;			/* start of data not yet cut */
		int togo = pp->pk_len;			/* bytes not yet cut */
		int n;							/* length of this piece */
		int fl = pp->pk_flag & FFSOM;	/* FFSOM goes on first piece only */
		int eom = pp->pk_flag & FFEOM;	/* FFEOM goes on last piece only */

		while (togo > 0) {
			n = min(togo, llim);
			if ((debugmask & PDMPACKET) && togo != pp->pk_len) {
				sprintf(pvmtxt, "pkt_to_host() refrag len %d\n", n);
				pvmlogerror(pvmtxt);
			}
#ifdef	STATISTICS
			stats.refrag++;
#endif
			piece = pk_new(0);

			/* header fields are copied from the original */
			piece->pk_src = pp->pk_src;
			piece->pk_dst = pp->pk_dst;
			piece->pk_cod = pp->pk_cod;
			piece->pk_enc = pp->pk_enc;
			piece->pk_wid = pp->pk_wid;
			piece->pk_crc = pp->pk_crc;

			if (n == togo)
				fl |= eom;
			piece->pk_flag = fl | FFDAT;
			fl = 0;

			/* pieces after the first may use the full length */
			llim = maxl;

			/* a window onto the original's data buffer */
			piece->pk_buf = pp->pk_buf;
			piece->pk_max = pp->pk_max;
			piece->pk_dat = cp;
			piece->pk_len = n;
			da_ref(pp->pk_buf);
			cp += n;

			LISTPUTBEFORE(hp->hd_txq, piece, pk_link, pk_rlink);
			togo -= n;
		}
		pk_free(pp);

	} else {
		LISTPUTBEFORE(hp->hd_txq, pp, pk_link, pk_rlink);
	}

	/* push pkts to opq while there is room and txq isn't empty */

	for (; ; ) {
		if (hp->hd_nop >= nopax)
			break;
		if (hp->hd_txq->pk_link == hp->hd_txq)
			break;

		if (debugmask & PDMPACKET) {
			sprintf(pvmtxt, "pkt_to_host() pkt to opq\n");
			pvmlogerror(pvmtxt);
		}
		qp = hp->hd_txq->pk_link;
		LISTDELETE(qp, pk_link, pk_rlink);
		TVCLEAR(&qp->pk_rtv);
		TVXADDY(&qp->pk_rta, &hp->hd_rtt, &hp->hd_rtt);
		TVCLEAR(&qp->pk_rto);
		TVCLEAR(&qp->pk_at);
		qp->pk_nrt = 0;
		qp->pk_ack = 0;
		qp->pk_hostd = hp;
		qp->pk_seq = hp->hd_txseq;
		hp->hd_txseq = NEXTSEQNUM(hp->hd_txseq);
		LISTPUTBEFORE(hp->hd_opq, qp, pk_link, pk_rlink);
		hp->hd_nop++;
		LISTPUTAFTER(opq, qp, pk_tlink, pk_trlink);
	}
	return 0;
}


/*	fin_to_host()
*
*	Build an FFFIN pkt addressed to the pvmd on host hp and put it
*	straight on the host's hd_opq and the global opq, without going
*	through hd_txq.  The pkt takes the host's next tx sequence number
*	and is counted in hd_nop.
*
*	Returns 0.
*/

int
fin_to_host(hp)
	struct hostd *hp;
{
	struct pkt *fin;

	if (debugmask & PDMPACKET) {
		sprintf(pvmtxt, "fin_to_host() %s\n", hp->hd_name);
		pvmlogerror(pvmtxt);
	}

	fin = pk_new(DDFRAGHDR);	/* XXX could reref a dummy databuf here */
	fin->pk_dat += DDFRAGHDR;

	fin->pk_flag = FFFIN;
	fin->pk_src = pvmmytid;
	fin->pk_dst = hp->hd_hostpart | TIDPVMD;

	/* timing and retry state */
	TVCLEAR(&fin->pk_rtv);
	TVXADDY(&fin->pk_rta, &hp->hd_rtt, &hp->hd_rtt);
	TVCLEAR(&fin->pk_rto);
	TVCLEAR(&fin->pk_at);
	fin->pk_nrt = 0;
	fin->pk_ack = 0;

	/* takes the next sequence number for this host */
	fin->pk_hostd = hp;
	fin->pk_seq = hp->hd_txseq;
	hp->hd_txseq = NEXTSEQNUM(hp->hd_txseq);

	LISTPUTBEFORE(hp->hd_opq, fin, pk_link, pk_rlink);
	hp->hd_nop++;
	LISTPUTAFTER(opq, fin, pk_tlink, pk_trlink);
	return 0;
}


/*	finack_to_host()
*
*	Build an FFFIN|FFACK pkt addressed to the pvmd on host hp and put
*	it on the global opq only.  Unlike fin_to_host() the pkt gets
*	sequence number 0, is not linked on the host's hd_opq and is not
*	counted in hd_nop.
*
*	Returns 0.
*/

int
finack_to_host(hp)
	struct hostd *hp;
{
	struct pkt *fa;

	if (debugmask & PDMPACKET) {
		sprintf(pvmtxt, "finack_to_host() %s\n", hp->hd_name);
		pvmlogerror(pvmtxt);
	}

	fa = pk_new(DDFRAGHDR);	/* XXX could reref a dummy databuf here */
	fa->pk_dat += DDFRAGHDR;

	fa->pk_flag = FFFIN|FFACK;
	fa->pk_src = pvmmytid;
	fa->pk_dst = hp->hd_hostpart | TIDPVMD;

	/* all timers cleared, including pk_rta */
	TVCLEAR(&fa->pk_rtv);
	TVCLEAR(&fa->pk_rta);
	TVCLEAR(&fa->pk_rto);
	TVCLEAR(&fa->pk_at);

	fa->pk_nrt = 0;
	fa->pk_seq = 0;
	fa->pk_ack = 0;
	fa->pk_hostd = hp;

	LISTPUTAFTER(opq, fa, pk_tlink, pk_trlink);
	return 0;
}


/*	pkt_to_task()
*
*	Add data pkt to send queue (txq) for a task.  Consume the pkt.
*
*	Depending on build options the pkt may instead be handed to
*	mpp_output(): for node tids on IMA_PGON/IMA_I860, or for tasks
*	without a socket when SHMEM is defined.
*
*	Refragmentation is only compiled in when LocalRefragmentTest is
*	defined: then a pkt whose data plus DDFRAGHDR exceeds ourudpmtu
*	is cut into >1 pkts.  Otherwise the pkt is queued as it is.
*
*	Returns 0.
*/

int
pkt_to_task(tp, pp)
	struct task *tp;
	struct pkt *pp;
{
	/*
	* task has a connected socket: add it to the fd work set
	* (presumably 2 selects the write set - confirm in wrk_fds_add)
	*/
	if (tp->t_sock >= 0 && (tp->t_flag & TF_CONN))
		wrk_fds_add(tp->t_sock, 2);

	/*
	* N.B. each conditional section below ends in a dangling "else",
	* so the sections chain into a single if/else statement whose
	* final branch is the queueing code at the bottom.
	*/
#if defined(IMA_PGON) || defined(IMA_I860)
	if (TIDISNODE(pp->pk_dst))
		mpp_output(tp, pp);
	else
#endif
#ifdef SHMEM
	if (tp->t_sock < 0)
		mpp_output(tp, pp);
	else
#endif

#ifdef LocalRefragmentTest
	/* fits in one pkt: queue it unchanged */
	if (pp->pk_len + DDFRAGHDR <= ourudpmtu) {
		LISTPUTBEFORE(tp->t_txq, pp, pk_link, pk_rlink);

	} else {
		struct pkt *pp2;
		int maxl = ourudpmtu - DDFRAGHDR;	/* most data per piece */
		char *cp = pp->pk_dat;				/* start of data not yet cut */
		int togo;							/* bytes not yet cut */
		int n;								/* length of this piece */
		int ff = pp->pk_flag & FFSOM;		/* FFSOM goes on first piece only */
		int fe = pp->pk_flag & FFEOM;		/* FFEOM goes on last piece only */

		for (togo = pp->pk_len; togo > 0; togo -= n) {
			n = min(togo, maxl);
			sprintf(pvmtxt, "pkt_to_task() refrag len %d\n", n);
			pvmlogerror(pvmtxt);
			pp2 = pk_new(0);
			pp2->pk_src = pp->pk_src;
			pp2->pk_dst = pp->pk_dst;
			if (n == togo)
				ff |= fe;
			pp2->pk_flag = ff | FFDAT;
			ff = 0;
			pp2->pk_cod = pp->pk_cod;
			pp2->pk_enc = pp->pk_enc;
			pp2->pk_wid = pp->pk_wid;
			pp2->pk_crc = pp->pk_crc;
			/* piece is a window onto the original's data buffer */
			pp2->pk_buf = pp->pk_buf;
			pp2->pk_max = pp->pk_max;
			pp2->pk_dat = cp;
			pp2->pk_len = n;
			da_ref(pp->pk_buf);
			cp += n;
			LISTPUTBEFORE(tp->t_txq, pp2, pk_link, pk_rlink);
		}
		/* original has been replaced by the pieces */
		pk_free(pp);
	}
#else /*LocalRefragmentTest*/
	LISTPUTBEFORE(tp->t_txq, pp, pk_link, pk_rlink);
#endif /*LocalRefragmentTest*/

	return 0;
}


#ifdef	STATISTICS
/*	dump_statistics()
*
*	Write the counters kept in stats to the log, one line per group.
*	Returns 0.
*/

dump_statistics()
{
	sprintf(pvmtxt,
			" select: rdy %d, zero %d, neg %d\n",
			stats.selrdy,
			stats.selzer,
			stats.selneg);
	pvmlogerror(pvmtxt);

	sprintf(pvmtxt,
			" sendto: ok %d, neg %d  recvfrom: ok %d\n",
			stats.sdok,
			stats.sdneg,
			stats.rfok);
	pvmlogerror(pvmtxt);

	sprintf(pvmtxt,
			" read: pos %d, zero %d, neg %d\n",
			stats.rdok,
			stats.rdzer,
			stats.rdneg);
	pvmlogerror(pvmtxt);

	sprintf(pvmtxt,
			" write: ok %d, short %d, zero %d, neg %d\n",
			stats.wrok,
			stats.wrshr,
			stats.wrzer,
			stats.wrneg);
	pvmlogerror(pvmtxt);

	sprintf(pvmtxt, " refrags: %d\n", stats.refrag);
	pvmlogerror(pvmtxt);

	sprintf(pvmtxt, " netwk resends: %d\n", stats.netret);
	pvmlogerror(pvmtxt);

	return 0;
}


reset_statistics()
{
	BZERO((char*)&stats, sizeof(stats));
	return 0;
}
#endif	/*STATISTICS*/


#if defined(IMA_CSPP) && defined(BALANCED_SPAWN)
static int number_nodes = -1;
static int number_cpus = -1;
static cnx_scid_t scid_num;

/*	__get_node_id()
*
*	Pick the node for the next spawned process.  The first call
*	fetches the node count and the cpu count of node 0 and returns
*	node 0.  Later calls advance one cpu at a time; when the cpus of
*	the current node are used up the next node is taken, wrapping
*	back to node 0 after the last one.  With a single node the
*	answer never changes.
*
*	Returns the chosen node number.
*/

static int
__get_node_id()
{
	static int node = 0;	/* node handed out last */
	static int cpu = 0;		/* cpu slot used on that node */

	/* first call: learn the configuration, start on node 0 */
	if (number_nodes == -1) {
		number_nodes = get_number_nodes();
		number_cpus = get_number_cpus(node);
		return node;
	}

	if (number_nodes == 1)
		return node;

	/* still a free cpu slot on the current node */
	if (cpu < (number_cpus - 1)) {
		cpu++;
		return node;
	}

	/* current node is full: move on, or wrap around to node 0 */
	cpu = 0;
	if (node < (number_nodes - 1)) {
		node++;
		number_cpus = get_number_cpus(node);
		return node;
	}

	node = 0;
	number_cpus = get_number_cpus(node);
	if (debugmask & PDMTASK) {
		pvmlogerror (
			"Warning:pvm_spawn restarting process placement on Node 0");
	}
	return node;
}


static cnx_is_scnode_basic_info_data_t sc_info[CNX_MAX_NODES];

/*	get_number_nodes()
*
*	Look up the scid of this process (saved in scid_num) and ask
*	cnx_sysinfo() for basic info on all of its nodes, which is stored
*	in sc_info[].
*
*	Returns the count reported by cnx_sysinfo().  If cnx_sysinfo()
*	returns -1 the error is logged and the process exits.
*/

static int
get_number_nodes()
{
	cnx_is_target_data_t target;
	cnx_pattributes_t attrs;
	char errortxt[128];
	int count;				/* filled in by cnx_sysinfo() */
	int rc;

	cnx_getpattr(getpid(), CNX_PATTR_SCID, &attrs);
	scid_num = attrs.pattr_scid;
	cnx_sysinfo_target_scnode(&target, scid_num, CNX_IS_ALL_NODES);

	rc = cnx_sysinfo(CNX_IS_SCNODE_BASIC_INFO, (void *) &target,
			sc_info, CNX_MAX_NODES,
			CNX_IS_SCNODE_BASIC_INFO_COUNT, (unsigned *) &count);

	if (rc != -1)
		return count;

	sprintf(errortxt,
			"Error calling cnx_sysinfo in %s:line %d errno: %d \n",
			__FILE__, __LINE__, errno);
	pvmlogerror(errortxt);
	exit (-1);
	return count;
}

/*	get_number_cpus()
*
*	Return the cpu count recorded in sc_info[] for the given node.
*	sc_info[] is filled in by get_number_nodes().
*/

static int
get_number_cpus(int current_node)
{
	int ncpus = sc_info[current_node].num_cpus;

	return ncpus;
}

#endif /*defined(IMA_CSPP) && defined(BALANCED_SPAWN)*/


