Newsgroups: comp.parallel.pvm
From: pelegrin@labri.u-bordeaux.fr (Francois PELLEGRINI)
Subject: PVM 3.2.6 problem on SP1 : messages sent to wrong processes
Keywords: fortran, pvm, messages, bad addressing
Organization: LaBRI - Laboratoire Bordelais de Recherche en Informatique, Bordeaux, France
Date: 27 Aug 1994 20:51:07 GMT
Message-ID: <33o8vr$gc7@serveur.cribx1.u-bordeaux.fr>


	Hello.

	I have a rather strange problem when running the public-domain
PVM 3.2.6 on an IBM SP1 machine. I have compiled PVM on an IBM 990 machine,
and the PV machine is made of the nodes of the SP1.
As far as I can see, it seems that processes receive messages that are
not sent to them.

	To illustrate this erroneous behavior, I have put below a simple
Fortran + PVM program, as well as its resulting log files. The goal of the
program is to perform data exchange between pair of processes, e.g.
processes 1<-->2, 3<-->4, and so on. The messages contain the number of
their sender, and the message tags are set to 600 plus this number.
The received messages are coherent, but sometimes they are received by
the wrong process.

	I have thought that I may have made a Fortran mistake which would
result in garbling PVM, but I have been unable to see it. Either it's too
big and I am so shortsighted that I do not see it, or there is something
strange happening...

	Any help would be strongly appreciated.

	Thanks in advance.

				f.p.
##
## Here is the Fortran program
##
      PROGRAM PRINCIPAL
       IMPLICIT NONE
       INCLUDE 'fpvm3.h'
       INTEGER COMTIDS(1024)
       INTEGER COMNBR
       INTEGER COMNUM,COMTID
       INTEGER COMPEER
       INTEGER RCVBUF,RCVSIZ,RCVTAG,RCVTID,RCVVAL
       INTEGER COMREP
       INTEGER I

       PRINT *,'X *** HELLO ***'

       CALL PVMFMYTID (COMTID)
       IF ( COMTID.LT.0 ) THEN
          PRINT *,'X --- pvmd unreachable'
          STOP
       END IF
       CALL PVMFJOINGROUP ('COMGROUP',COMNUM)
       IF ( COMNUM.LT.0 ) THEN
          PRINT *,'X --- Cannot access group',COMNUM
          CALL PVMFEXIT (COMREP)
          STOP
       END IF
       COMNUM = COMNUM + 1

       IF ( COMNUM.EQ.1 ) THEN

C We are the initial process

 100      PRINT *,'1 Enter number of processes (multiple of 2) '
          READ (*,*) COMNBR
          IF ( COMNBR.LT.1 ) THEN
             print*,'1 Incorrect number of processes. Retry.'
             GOTO 100
          END IF

C Launch the other processes

          COMTIDS(1) = COMTID
          CALL PVMFSPAWN ('testpvm',PVMDEFAULT,'*',
     &                    COMNBR-1,COMTIDS(2),COMREP)
          IF ( COMREP.LT.(COMNBR-1) ) THEN
             PRINT *,'1 --- Cannot launch',COMREP
             CALL PVMFEXIT (COMREP)
             STOP
          END IF

C Broadcast the TID array

          CALL PVMFINITSEND (PVMDEFAULT,COMREP)
          CALL PVMFPACK     (INTEGER4,COMNBR,1,1,COMREP)
          CALL PVMFPACK     (INTEGER4,COMTIDS,COMNBR,1,COMREP)
          CALL PVMFMCAST    (COMNBR-1,COMTIDS(2),0,COMREP)
       ELSE

C We are the launched processes

          CALL PVMFGETTID ('COMGROUP',0,COMTIDS(1))
          CALL PVMFRECV   (COMTIDS(1),0,COMREP)
          CALL PVMFUNPACK (INTEGER4,COMNBR,1,1,COMREP)
          CALL PVMFUNPACK (INTEGER4,COMTIDS,COMNBR,1,COMREP)
       END IF          

       PRINT *,COMNUM,'*** TID LIST ***'
       DO I=1,COMNBR
          PRINT *,COMNUM,' COMNBRC=',I,' TID=',COMTIDS(I)
       END DO
       PRINT *,COMNUM,'*** BEFORE TEST ***'

       CALL PVMFBARRIER ('COMGROUP',COMNBR,COMREP)

       IF ( (COMNUM / 2).NE.((COMNUM - 1) / 2) ) THEN

C Even instance number (2, 4, 6, ...)

          COMPEER = COMNUM - 1

          CALL PVMFINITSEND (PVMDEFAULT,COMREP)
          CALL PVMFPACK (INTEGER4,COMNUM,1,1,COMREP)
          CALL PVMFSEND (COMTIDS(COMPEER),600+COMNUM,COMREP)
          CALL PVMFRECV (-1,-1,RCVBUF)
          CALL PVMFBUFINFO (RCVBUF,RCVSIZ,RCVTAG,RCVTID,COMREP)
          CALL PVMFUNPACK (INTEGER4,RCVVAL,1,1,COMREP)
       ELSE

C Odd instance number (1, 3, 5, ...)

          COMPEER = COMNUM + 1

          CALL PVMFRECV (-1,-1,RCVBUF)
          CALL PVMFBUFINFO (RCVBUF,RCVSIZ,RCVTAG,RCVTID,COMREP)
          CALL PVMFUNPACK (INTEGER4,RCVVAL,1,1,COMREP)
          CALL PVMFINITSEND (PVMDEFAULT,COMREP)
          CALL PVMFPACK (INTEGER4,COMNUM,1,1,COMREP)
          CALL PVMFSEND (COMTIDS(COMPEER),600+COMNUM,COMREP)
       END IF

       IF ( RCVBUF.LT.1 ) THEN
         PRINT *,COMNUM,'--- Bad receive buffer'
       END IF

       PRINT *,COMNUM,' COMPEER=',COMPEER,' RCVTAG=',RCVTAG,
     &       ' RCVVAL=',RCVVAL

       IF ( RCVTAG.NE.(RCVVAL+600) ) THEN
         PRINT *,COMNUM,'--- Incoherent message',
     &         RCVTAG,RCVVAL
       END IF
       IF ( COMPEER.NE.RCVVAL ) THEN
         PRINT *,COMNUM,'--- Misreceived message (1)',
     &         COMPEER,RCVVAL
       END IF
       IF ( COMTIDS(COMPEER).NE.RCVTID ) THEN
         PRINT *,COMNUM,'--- Misreceived message (2)',
     &         COMTIDS(COMPEER),RCVTID
       END IF

       CALL PVMFBARRIER ('COMGROUP',COMNBR,COMREP)

       PRINT *,COMNUM,'*** AFTER TEST ***'

       CALL PVMFLVGROUP ('COMGROUP',COMREP)
       CALL PVMFEXIT (COMREP)

       END
##
##  The resulting text console output
##

./testpvm 
 X *** HELLO ***
 1 Enter number of processes (multiple of 2) 
8
 1 *** TID LIST ***
 1  COMNBRC= 1  TID= 262147
 1  COMNBRC= 2  TID= 1048577
 1  COMNBRC= 3  TID= 1310721
 1  COMNBRC= 4  TID= 1572865
 1  COMNBRC= 5  TID= 1835009
 1  COMNBRC= 6  TID= 2097153
 1  COMNBRC= 7  TID= 2359297
 1  COMNBRC= 8  TID= 2621441
 1 *** BEFORE TEST ***
 1  COMPEER= 2  RCVTAG= 602  RCVVAL= 2
 1 --- Misreceived message (2) 1048577 2097153

##
##  The resulting PVMD log file
##

[t80040000] ready   Fri Aug 26 14:30:51 1994
[t80040000] dm_addhost() already adding new hosts, oops
[t80040000] dm_addhost() already adding new hosts, oops
[t80040000] dm_addhost() already adding new hosts, oops
[t80040000] dm_addhost() already adding new hosts, oops
[t80040000] [t100001]  X *** HELLO ***
[t80040000] [t200001]  X *** HELLO ***
[t80040000] [t1c0001]  X *** HELLO ***
[t80040000] [t140001]  X *** HELLO ***
[t80040000] [t240001]  X *** HELLO ***
[t80040000] [t280001]  X *** HELLO ***
[t80040000] [t180001]  X *** HELLO ***
[t80040000] [t200001]  2 *** TID LIST ***
[t80040000] [t200001]  2  COMNBRC= 1  TID= 262147
[t80040000] [t100001]  3 *** TID LIST ***
[t80040000] [t100001]  3  COMNBRC= 1  TID= 262147
[t80040000] [t200001]  2  COMNBRC= 2  TID= 1048577
[t80040000] [t100001]  3  COMNBRC= 2  TID= 1048577
[t80040000] [t200001]  2  COMNBRC= 3  TID= 1310721
[t80040000] [t100001]  3  COMNBRC= 3  TID= 1310721
[t80040000] [t200001]  2  COMNBRC= 4  TID= 1572865
[t80040000] [t100001]  3  COMNBRC= 4  TID= 1572865
[t80040000] [t200001]  2  COMNBRC= 5  TID= 1835009
[t80040000] [t100001]  3  COMNBRC= 5  TID= 1835009
[t80040000] [t200001]  2  COMNBRC= 6  TID= 2097153
[t80040000] [t100001]  3  COMNBRC= 6  TID= 2097153
[t80040000] [t200001]  2  COMNBRC= 7  TID= 2359297
[t80040000] [t140001]  4 *** TID LIST ***
[t80040000] [t140001]  4  COMNBRC= 1  TID= 262147
[t80040000] [t140001]  4  COMNBRC= 2  TID= 1048577
[t80040000] [t140001]  4  COMNBRC= 3  TID= 1310721
[t80040000] [t140001]  4  COMNBRC= 4  TID= 1572865
[t80040000] [t140001]  4  COMNBRC= 5  TID= 1835009
[t80040000] [t140001]  4  COMNBRC= 6  TID= 2097153
[t80040000] [t140001]  4  COMNBRC= 7  TID= 2359297
[t80040000] [t140001]  4  COMNBRC= 8  TID= 2621441
[t80040000] [t140001]  4 *** BEFORE TEST ***
[t80040000] [t100001]  3  COMNBRC= 7  TID= 2359297
[t80040000] [t200001]  2  COMNBRC= 8  TID= 2621441
[t80040000] [t100001]  3  COMNBRC= 8  TID= 2621441
[t80040000] [t200001]  2 *** BEFORE TEST ***
[t80040000] [t100001]  3 *** BEFORE TEST ***
[t80040000] [t180001]  6 *** TID LIST ***
[t80040000] [t180001]  6  COMNBRC= 1  TID= 262147
[t80040000] [t1c0001]  5 *** TID LIST ***
[t80040000] [t1c0001]  5  COMNBRC= 1  TID= 262147
[t80040000] [t1c0001]  5  COMNBRC= 2  TID= 1048577
[t80040000] [t1c0001]  5  COMNBRC= 3  TID= 1310721
[t80040000] [t1c0001]  5  COMNBRC= 4  TID= 1572865
[t80040000] [t1c0001]  5  COMNBRC= 5  TID= 1835009
[t80040000] [t1c0001]  5  COMNBRC= 6  TID= 2097153
[t80040000] [t1c0001]  5  COMNBRC= 7  TID= 2359297
[t80040000] [t1c0001]  5  COMNBRC= 8  TID= 2621441
[t80040000] [t1c0001]  5 *** BEFORE TEST ***
[t80040000] [t180001]  6  COMNBRC= 2  TID= 1048577
[t80040000] [t240001]  7 *** TID LIST ***
[t80040000] [t240001]  7  COMNBRC= 1  TID= 262147
[t80040000] [t240001]  7  COMNBRC= 2  TID= 1048577
[t80040000] [t240001]  7  COMNBRC= 3  TID= 1310721
[t80040000] [t240001]  7  COMNBRC= 4  TID= 1572865
[t80040000] [t240001]  7  COMNBRC= 5  TID= 1835009
[t80040000] [t240001]  7  COMNBRC= 6  TID= 2097153
[t80040000] [t240001]  7  COMNBRC= 7  TID= 2359297
[t80040000] [t240001]  7  COMNBRC= 8  TID= 2621441
[t80040000] [t240001]  7 *** BEFORE TEST ***
[t80040000] [t180001]  6  COMNBRC= 3  TID= 1310721
[t80040000] [t180001]  6  COMNBRC= 4  TID= 1572865
[t80040000] [t180001]  6  COMNBRC= 5  TID= 1835009
[t80040000] [t180001]  6  COMNBRC= 6  TID= 2097153
[t80040000] [t180001]  6  COMNBRC= 7  TID= 2359297
[t80040000] [t180001]  6  COMNBRC= 8  TID= 2621441
[t80040000] [t180001]  6 *** BEFORE TEST ***
[t80040000] [t280001]  8 *** TID LIST ***
[t80040000] [t280001]  8  COMNBRC= 1  TID= 262147
[t80040000] [t280001]  8  COMNBRC= 2  TID= 1048577
[t80040000] [t280001]  8  COMNBRC= 3  TID= 1310721
[t80040000] [t280001]  8  COMNBRC= 4  TID= 1572865
[t80040000] [t280001]  8  COMNBRC= 5  TID= 1835009
[t80040000] [t280001]  8  COMNBRC= 6  TID= 2097153
[t80040000] [t280001]  8  COMNBRC= 7  TID= 2359297
[t80040000] [t280001]  8  COMNBRC= 8  TID= 2621441
[t80040000] [t280001]  8 *** BEFORE TEST ***
[t80040000] [t240001]  7  COMPEER= 8  RCVTAG= 608  RCVVAL= 8
[t80040000] [t1c0001]  5  COMPEER= 6  RCVTAG= 606  RCVVAL= 6
[t80040000] [t1c0001]  5 --- Misreceived message (2) 2097153 1572865
[t80040000] [t100001]  3  COMPEER= 4  RCVTAG= 601  RCVVAL= 1
[t80040000] [t100001]  3 --- Misreceived message (1) 4 1
[t80040000] [t100001]  3 --- Misreceived message (2) 1572865 262147
[t80040000] [t140001]  4  COMPEER= 3  RCVTAG= 604  RCVVAL= 4
[t80040000] [t140001]  4 --- Misreceived message (1) 3 4
[t80040000] [t280001]  8  COMPEER= 7  RCVTAG= 607  RCVVAL= 7
[t80040000] [t200001]  2  COMPEER= 1  RCVTAG= 605  RCVVAL= 5
[t80040000] [t200001]  2 --- Misreceived message (1) 1 5
[t80040000] [t200001]  2 --- Misreceived message (2) 262147 1835009
[t80040000] [t180001]  6  COMPEER= 5  RCVTAG= 603  RCVVAL= 3
[t80040000] [t180001]  6 --- Misreceived message (1) 5 3
[t80040000] [t180001]  6 --- Misreceived message (2) 1835009 1048577
[t80040000] [t280001]  8 *** AFTER TEST ***
[t80040000] [t240001]  7 *** AFTER TEST ***
[t80040000] [t200001]  2 *** AFTER TEST ***
[t80040000] [t1c0001]  5 *** AFTER TEST ***
[t80040000] [t180001]  6 *** AFTER TEST ***
[t80040000] [t140001]  4 *** AFTER TEST ***
[t80040000] [t100001]  3 *** AFTER TEST ***

###
				pelegrin@labri.u-bordeaux.fr


