
	program synch_example
c
c  This is a sample program to show how the various Parallel Library routines
c  can be used to coordinate parallel programs.
c
c  Craig Yankes
c  Digital Equipment Corporation
c  305 Foster Street LTN2-2/H17
c  Littleton, Massachusetts 01460
c
c
c  Data definitions
c
	implicit none

c
c  define the number of subprocesses that we'll want.
c

c**
c**  Note that this is the *only* place in this program where the number
c**  of subprocesses is specified.  All of the rest of the code is written
c**  to handle any number (greater than zero) of subprocesses.  The code
c**  refers to this parameter in the few places where the actual
c**  number is needed.
c**

	parameter number_of_subprocesses = 4

c
c  external routines (Parallel Library entry points; all return status codes)
c
	integer plib$init_pagefile,
	1	plib$init_signals,
	1	plib$share_memory,
	1	plib$share_code,
	1	plib$current_pc,
	1	plib$create_subprocesses,
	1	plib$signal_subprocesses,
	1	plib$signal_main,
	1	plib$synch,
	1	plib$set_bit_interlocked,
	1	plib$clear_bit_interlocked,
	1	plib$main_pid,
	1	plib$current_image,
	1	plib$delete_subprocesses

	external last_routine

c
c  shared variables (all live in the TEST_DATA common block below)
c
	integer iloop, ientry, iflag, task_number, isource(100), idest(100)

c
c  local (process-private) variables
c
	integer ignore, my_num, subprocess_pids(number_of_subprocesses),
	1	i, my_task, isection_number

	character*8 main_pid
	character*132 image_name

c**
c**  This next definition is the garbage ranges that we'll pad around our
c**  shared memory region.  This technique ensures that any variable that
c**  we don't want to share will be pushed onto non-shared pages.  The
c**  problem that we're trying to get around here is that since all memory
c**  sharing is done on full page boundaries, the definition of what will
c**  be shared is from "the first byte of the page containing the starting
c**  address to the last byte of the page containing the ending address."
c**  This raises the problem that this extra section being shared could
c**  have variables that we don't want to share!  The easiest solution we're
c**  going to use is to pad an extra page of unused variable space around
c**  the shared variables.  Therefore, no matter what is "accidentally"
c**  shared, the extra shared variables are part of our unused range.
c**
c**  Another way around this problem is to use a LINKER option file to
c**  specify that the PSECT created by this common block should not be
c**  on the same page as any other PSECT.  As a personal preference, I
c**  prefer using the garbage range over the options file since once the
c**  garbage range is added, no further action is needed.  A linker options
c**  file requires one more thing to be remembered at link time...
c**

	byte front_garbage(512), end_garbage(512)

c
c  define the common block that will contain the shared data.
c  NOTE: the ordering of these variables matters - the garbage arrays must
c  bracket the variables to be shared (see the comment block above).
c
	common /test_data/ front_garbage, isource, idest, task_number,
	1	iflag, iloop, ientry, end_garbage


c
c  Perform initialization code.
c

c**
c**  The first thing to be done is to initialize access to the
c**  process private page file.  We are going to instruct this routine
c**  to create a 20 block page file for us and we'll append our main process's
c**  PID to make the name unique.
c**

	ignore = plib$main_pid (main_pid)
	ignore = plib$init_pagefile (main_pid//'page.file',20)

c**
c**  The next thing to do is to set up the shared executable code region.
c**  To find the starting and ending addresses we will use the following
c**  techniques:
c**
c**  Starting address:
c**
c**  In Fortran, it is very difficult for a module to determine its own
c**  entrypoint.  (As opposed to one module trying to determine the entry-
c**  point of another module, which is very easy as we'll show in a moment.)
c**  Therefore, we'll use the PLIB$CURRENT_PC routine to find out the
c**  current virtual address and use this to be the starting address for
c**  the code sharing.
c**
c**  Ending address:
c**
c**  As mentioned in the previous paragraph, it is rather easy for one module
c**  to determine the starting address of another.  Because of this, a dummy
c**  routine has been placed at the end of this program that will serve
c**  as a "bookmark" to denote the end of what we'll set up as being shared.
c**  As another routine, its entrypoint address can be determined by declaring
c**  the routine to be external and performing a %LOC function on its name.
c**  (If this program was made up of multiple source files, this bookmark
c**  routine should be at the end of the last .obj file in the order in
c**  which they are linked together.  Not having it at the end won't create
c**  any errors, thankfully, but will lower the amount of code that is
c**  shared.)
c**
c**  Combining these two methods, we can set up the executable code as being
c**  shared by executing:
c**

	ignore = plib$share_code (plib$current_pc(), %loc(last_routine),
	1	main_pid//'code')


c
c  Perform the last init call for the signalling mechanism.  We'll let
c  it use event flags 64 through 66.  Also, this call will tell us if
c  we are a subprocess or not.
c

c**
c**  This next call will instruct the PLIB routines to perform the
c**  various process synchronization initialization functions required.
c**  The main item is the initialization of the particular event flag
c**  range as being shared and establishing the event flag numbers to use
c**  for the various purposes.
c**
c**  The value that comes back in MY_NUM will be of interest to us in
c**  a few more lines of code and it will be explained then.
c**

	ignore = plib$init_signals (64, number_of_subprocesses, my_num)



c**
c**  What we are setting up as shared memory by this call is the range
c**  of memory from ISOURCE(1) to the last location that we want to share.
c**  Two notes:
c**
c**  1) Note that we aren't including the "garbage" ranges as part of the
c**  virtual address space to share.  The intent of the garbage ranges is
c**  to pad around the shared variables to force process-private variables
c**  onto non-shared pages.  Including the garbage ranges in the VA to share
c**  would defeat this purpose.
c**
c**  2) Also note that we aren't doing a %LOC(IENTRY) to find the address
c**  to end our memory sharing upon even though IENTRY is the last variable
c**  that we want to share.  The reason for this is very subtle and can
c**  lead to debugging nightmares if violated!  The problem is that the
c**  %LOC directive returns the *starting* byte address for the variable,
c**  not the ending byte address.  If the last variable is anything other
c**  than a byte variable (as in our case), problems will occur if this
c**  variable crosses a page boundary.  If something like this were to
c**  occur to the last variable to be shared:
c**
c**             IENTRY:           |
c**		+--------+--------+--------+--------+
c**		| byte 1   byte 2   byte 3   byte 4 |
c**             +--------+--------+--------+--------+
c**                               |
c**                               | page boundary
c**
c**  and we used the %LOC(IENTRY) as the ending address of the shared region,
c**  the last two bytes of the variable would be in a process-private page!
c**  (Remember, the sharing is done up to "the last byte of the page containing
c**  the ending address".)  We have now accidentally created a variable that
c**  is literally half shared and half private!  To get around this problem,
c**  rather than specifying %LOC of the last variable to be shared, specify
c**  %LOC()-1 of the starting location of the ending garbage range.  Since by
c**  definition of common blocks these variables are in contiguous memory
c**  locations, specifying the starting address of the end garbage range
c**  minus 1 will equal the virtual address of the *last* byte of the last
c**  variable to be shared.
c**
c**
c**  Also note that the act of setting up the global section will zero the
c**  contents of any variable within the range.  (It is only zeroed when
c**  the global section is created - just mapping to it doesn't disturb the
c**  values within the section.)
c**

	ignore = plib$share_memory (%loc(isource(1)),
	1	%loc(end_garbage(1))-1, main_pid//'MEMTEST')

c
c  next, figure out if this process is the subprocess or not.  If this is
c  a subprocess, jump to its section of code.  If the main process, continue
c  down to the main process code.
c

c**
c**  Remember the MY_NUM variable we received two lines of code before?
c**  Now that we're done initializing everything, we'll use this value
c**  to tell us if we're the main process or not.  A couple of items to
c**  note at this point:
c**
c**  1)  Later on when the subprocesses are created, we'll have them run
c**  the exact same image as the main process is executing.  This has a
c**  number of advantages:
c**
c**	1a - Running the same image allows the main process and the
c**	     subprocesses to share the same executable code in
c**	     memory.  Thus a memory savings.
c**
c**	1b - Having a single image as opposed to two (one each for the main
c**	     process and the subprocesses) avoids .EXE version mismatch
c**	     problems.  Also, there is still one .EXE for distribution.
c**
c**	1c - And, probably the most important for ease of debugging, having
c**	     a single .EXE means that there is a single set of variable
c**	     definitions.
c**
c**  2)  Also note that when the subprocesses are created, they will start
c**  to run this executable image from the very first line of code.  Therefore,
c**  be careful not to put main process-only code before executing this
c**  next test.
c**
c**  3)  As an addition to point 2, any Parallel Library routine that should
c**  *not* do its function if called by a subprocess (creating a page file,
c**  for example) does the right thing if called by the main or a subprocess.
c**  Because of this, all of the init calls above can be safely executed
c**  by every process (main or sub) and thus special case handling for
c**  subprocesses does not have to be written into the code.
c**

	if (my_num .gt. 0) go to 1000


c**********************************************************************
c
c	MAIN PROCESS CODE.
c
c
c  first step is to create the subprocesses
c

c**
c**  To guard against problems arising from either renaming this .EXE or
c**  from running it from a different directory, we'll first use the
c**  PLIB$CURRENT_IMAGE routine to get the image name.  Once this is
c**  obtained, create the subprocesses.  (Note, since these subprocesses
c**  will not be performing any I/O to the SYS$INPUT and SYS$OUTPUT logicals,
c**  we'll define these to the NULL device so that we don't get any "file
c**  not found" errors.)
c**

	ignore = plib$current_image (image_name)
	ignore = plib$create_subprocesses (number_of_subprocesses,
	1	image_name, 'NL:', 'NL:', subprocess_pids)

c
c  wait for them to report in as ready.
c

c**
c**  The main process will now wait until the last subprocess reports that
c**  all of them have been created.  The reason why we want to wait here
c**  is that this is the only time (for this program, that is) that the
c**  subprocesses are running concurrently with the main process.  We
c**  want to wait for them to report their completion before assigning
c**  them work to do.  We'll use the PLIB$SYNCH routine to synchronize
c**  with the subprocesses.
c**
c**  If the main process had other work to perform that didn't require the
c**  subprocesses, such as reading a database from a disk, this would be
c**  a good place to do this work so that the act of creating the subprocesses
c**  can be done in parallel with the main process performing its work.
c**  Once the main has done everything it wants to, it can wait for the
c**  subprocesses to be ready by doing an:
c**

	ignore = plib$synch ()


c
c  ok, first section for the subprocesses to execute.  Set the task number
c  to 1, signal for them to be awakened and wait for them to complete their
c  tasks.
c

c**
c**  The TASK_NUMBER variable is how the subprocesses will determine which
c**  subprocess will do which particular piece(s) of work in this parallel
c**  section.  (This will be explained in detail in the subprocess code.)
c**  Since the Parallel Library routines can't guess how this should be
c**  reset, we'll have to manually reset it before every parallel section.
c**
c**  Once it is reset, the main process will then awaken the subprocesses
c**  by calling PLIB$SIGNAL_SUBPROCESSES.  This call will specify for the
c**  subprocesses to execute the first parallel section and for the main
c**  process to wait for their completion before returning.
c**

	task_number = 1
	ignore = plib$signal_subprocesses (1,1)

c**
c**  If there was any single-stream work that the main process has to
c**  execute before entering the second parallel section, it would go here.
c**

c
c  The subprocesses are done executing the first parallel section.  Reset
c  the task number and signal for them to execute the second parallel
c  section.  Again, the main process will wait for the subprocesses to
c  complete before continuing.
c

c**
c**  This parallel section will be executed 3 times to show that the
c**  parallel section numbers do not have to be unique for every time the
c**  subprocesses are awakened.
c**

	do i = 1,3
	   task_number = 1
	   ignore = plib$signal_subprocesses (2,1)
	enddo

c
c  All done, delete the subprocesses, write out the loop and entry counts
c  and exit.
c

c**
c**  The subprocess deletion will be performed by calling the
c**  PLIB$DELETE_SUBPROCESSES routine using the PID list obtained
c**  when they were created.
c**

	ignore = plib$delete_subprocesses (number_of_subprocesses,
	1	subprocess_pids)


c
c  Since the calls to PLIB$SET_BIT_INTERLOCKED have used the optional form,
c  write out the total loop count and entry count that was derived.
c

c**
c**  Watching the LOOP and ENTRY counts can point out some bottlenecks
c**  in the paralleled code.  For example, if the LOOP count is significantly
c**  higher than the ENTRY count (always more than double or triple), it
c**  shows that the subprocesses are being paused awaiting entry into
c**  critical sections too often.  The cause of this is generally either
c**  that the critical section is too large or else the amount of work
c**  being executed in a single task is too small.  (In either case, it
c**  turns into a study of the ratio between the amount of code required
c**  in the critical section for next task scheduling compared to the amount
c**  of work in the task to be done.)
c**
c**  A very interesting case is where the LOOP and ENTRY counts are normally
c**  close to each other (LOOP averaging 10-20% higher than ENTRY) but
c**  periodically the LOOP count increases to being hundreds to thousands
c**  of times what the ENTRY count is.  What can cause this situation is
c**  if there are more computable processes in the system than there are
c**  available compute elements.  In this situation, the subprocesses spend
c**  some of their time on the scheduling queue and not executing.  If a
c**  particular subprocess is interrupted and placed back on the scheduling
c**  queue while it is in a critical section (i.e. owns a lock), any other
c**  subprocess that tries to do a PLIB$SET_BIT_INTERLOCKED on that same
c**  flag will continually loop until it is interrupted and put back onto
c**  the scheduling queue.  This situation continues until the subprocess
c**  that owns the lock is rescheduled for execution.  The name of this
c**  situation is "compute block" where the progress of an executing subprocess
c**  is blocked by a lock owned by another subprocess that is not currently
c**  executing.  This situation cannot be totally avoided, but two methods
c**  can be implemented to minimize the impact of this:
c**
c**  1)  Don't try to run significantly more subprocesses than the number
c**  of compute elements.  Trying to run ten subprocesses on a VAX 8800
c**  (which has two compute elements) will create the compute block
c**  situation even if there are no other processes on the system.
c**
c**  2)  Keep critical sections as small as possible.  The faster they are
c**  executed, the lower the chance that the end of the subprocess's compute
c**  quantum will occur during a critical section.
c**
c**  If efficiency is very critical to an application, the program can
c**  monitor these values to determine how many subprocesses should be
c**  executing.  If, for example, two compute-bound paralleled programs are
c**  running on a VAX 8800, they will both tend to run more efficiently
c**  if they each use only a single subprocess rather than two (or more).
c**  If the program sees the LOOP-to-ENTRY ratio climbing, it can set
c**  shared variables to say that only a portion of the subprocesses should
c**  be executed.  (How this can be done will be described in the subprocess
c**  code.)

	write (6,9924) iloop, ientry
9924	format (' The demo program is complete.  The counts from ',
	1	'PLIB$SET_BIT_INTERLOCKED are:',/,
	1	'   Loop Count = ',i,'  Entry Count = ',i)

	call exit




c  (end of main process code - everything below statement label 1000 is
c  executed only by the subprocesses.)

c
c-------------------------------------------------------------------------
c  come here if we are a subprocess.
c

c**
c**  Before we do anything else, lets review what a subprocess has done
c**  prior to coming to this section of code.  Each subprocess has:
c**
c**	1)  Initialized the signaling mechanisms and page file,
c**
c**	2)  Has mapped to the shared memory global sections, and,
c**
c**	3)  Has determined that we're a subprocess and thus jumped to here.
c**	(And, incidentally, has a subprocess unique number in MY_NUM that
c**	can be useful in debugging.)
c**
c**  It is important to keep in mind that all of the executable code prior
c**  to the decision point of whether we are a main process or a subprocess
c**  is executed by ALL of the processes and subprocesses!  Now, lets see
c**  what the subprocesses have to do:
c**

c
c  tell the main process that we're ready for the next functional task
c  and wait for it to wake us up.
c

c**
c**  Every subprocess will execute this call to PLIB$SIGNAL_MAIN and when
c**  the last one has called it, the main process will be signalled via
c**  an event flag.  The ISECTION_NUMBER parameter is how the subprocesses
c**  will receive the section number parameter from the main process's call
c**  to PLIB$SIGNAL_SUBPROCESSES.
c**

1000	ignore = plib$signal_main (isection_number)


c
c  We've been awakened and so there is work for us to do.  The first thing
c  we have to do is to check the ISECTION_NUMBER shared variable to tell us
c  what piece of code to enter.
c

c**
c**  As mentioned earlier, the application might not want to have every
c**  subprocess actually execute the parallel sections to avoid the 
c**  compute block situation if there are many other processes running on
c**  the system.  To implement this, two things are required:
c**
c**  1)  There should be a shared variable that denotes the maximum number
c**      of subprocesses to actively participate in the parallel sections.
c**      This variable would normally be maintained by the main process.
c**
c**  2)  As soon as the subprocesses are awakened, each subprocess would
c**      compare its own subprocess number against the shared variable
c**      described in point 1.  If the subprocess number is larger than
c**      the desired maximum subprocess number, it would immediately
c**      branch back to statement label 1000 to report its own completion.
c**
c**  Some notes on implementing this:
c**
c**  1)  Each subprocess requires a unique number in the range of 1 to
c**      the number of subprocesses.  The MY_NUM parameter on the
c**      PLIB$INIT_SIGNALS routine conveniently provides this.
c**
c**  2)  Implicit with being able to change the number of executing
c**      subprocesses is the notion that the work that each subprocess
c**      will do can't be hardwired into the code.  This is not a difficult
c**      goal to achieve if task scheduling is used as opposed to hardwiring
c**      DO loop indexes, for example.
c**


c- - - - - - - - - - -  P A R T  1 - - - - - - - - - - - - - -
c
c  if isection_number is a 1, it means that we're to repeatedly get the next
c  task number and write it out to a disk file.
c
	if (isection_number .eq. 1) then
c
c  until we're done (i.e. no more tasks available), keep doing work
c

c**
c**  This next rather unglamorous line of code forces an infinite loop until
c**  the subprocess realizes that there is no additional work available for
c**  it to do.  (The only exit is the GO TO 1000 below when the tasks are
c**  exhausted.)
c**

	   do while (1 .eq. 1)
c
c  get the next task available for work.  *** Note, this must be in a
c  critical section. ***
c

c**
c**  This is a standard critical section.  A "critical section" is a piece
c**  of code that can only be executed by only one subprocess at a time.
c**  These occur where the subprocesses are modifying a particular shared
c**  variable that the other subprocesses are going to use.  The purpose
c**  of the critical section is to avoid race conditions between the
c**  reads and writes.
c**
c**  What we're doing here is allocating the next task that is available for
c**  work.  A "task" by definition is an independent piece of work within the
c**  parallel section.  This parallel section is to write out 100 numbers
c**  along with the subprocess number that wrote it out.  The execution of
c**  each individual number has been defined here as a separate task.
c**
c**  Most critical sections are no more complex than this one and perform
c**  a similar function.
c**
c**  The first step to a critical section is to attempt to set a flag bit
c**  using the interlocked VAX instructions.  The PLIB$SET_BIT_INTERLOCKED 
c**  routine will not return until its subprocess has successfully set the
c**  bit.  Therefore, the lines of code that follow will not be executed
c**  until that subprocess owns the flag and thus the permission to enter
c**  the critical section.  (Note, this example uses the optional form
c**  of the routine to get the LOOP and ENTRY count values.)

	      call plib$set_bit_interlocked (iflag, iloop, ientry)

c**
c**  Now that we have obtained permission to enter the critical section,
c**  the first thing that we'll do is to make a process-private copy of
c**  the shared "what is the next available task" variable.  (Note that
c**  the TASK_NUMBER variable was initialized by the main process before
c**  awakening the subprocesses.)
c**

	      my_task = task_number

c**
c**  Next, increment the TASK_NUMBER variable so that the next subprocess
c**  will get the next task and not the one that we just allocated.
c**

	      task_number = task_number + 1

c**
c**  All done with the critical section and thus the flag variable should
c**  be cleared.  This will permit a subprocess that is waiting to enter
c**  this critical section to enter.
c**

	      call plib$clear_bit_interlocked (iflag)

c**
c**  Since this section only has 100 defined tasks within it, we need
c**  a check to see if we have run out of things to do.  If the number
c**  we just allocated is greater than 100, go back to where the subprocess
c**  will count-down and report completion.
c**

	      if (my_task .gt. 100) go to 1000

c**
c**  We now know that we have a valid task to perform.  The "work" to be done
c**  in section 1 is just to write out who we are, the parallel section
c**  part number and the task number we just allocated.  (Basically, this
c**  is where the "real task work" (as opposed to this code) would be
c**  executed once a task was allocated and validated.)
c**

	      write (1,1100) my_num, isection_number, my_task
1100	      format (' I am subprocess number ',i2,', am in part ',i2,
	1	' and am executing task ',i)


c**
c**  ...and loop back for any additional tasks that can be executed.
c**
	   enddo

c**
c**  And this is the end of Parallel Section 1.
c**
	endif


c- - - - - - - - - - -  P A R T  2 - - - - - - - - - - - - - -

c**
c**  Since the structure of this parallel section is so identical to the
c**  previous (as will be the case for the controlling mechanisms of most
c**  sections), comments will only be put in for specific things.
c**

c
c  second type of thing to do:  we are going to copy one array into another
c  in groups of 10 per task allocation.
c
	if (isection_number .eq. 2) then

c  loop forever until we determine we're done

	   do while (1 .eq. 1)

c
c  lock the scheduling bit and allocate the next task.  Note that we're going
c  to increment the "next task" by ten since we're doing groups of ten.  This
c  way, the "next task" variable always shows the starting array location for
c  the next chunk of ten.
c

c**
c**  As mentioned in the comment above, note that we are not incrementing
c**  the TASK_NUMBER to the next task, but rather to the starting array
c**  location for the next task.  This saves us from having to later multiply
c**  the task number by 10 to derive this number.
c**

	      call plib$set_bit_interlocked (iflag)
	      my_task = task_number
	      task_number = task_number + 10
	      call plib$clear_bit_interlocked (iflag)

c
c  are we done?  If so, loop back to where we report completion.
c
	      if (my_task .gt. 100) go to 1000

c
c  Aha!  We have a valid task to perform.  Move these ten locations.
c
	      do 1200 i = my_task, my_task + 9
	         idest(i) = isource(i)
1200	      continue

c
c  and loop back for the next task if any are available.
c
	   enddo
c
c  and the end of this parallel section.
c
	endif




c- - - - - - - - - - -  P A R T  unknown - - - - - - - - - - - - - -
c
c  Hmmm, if we got to here we are seeing a part number that we really
c  didn't expect to see.  Write out an error message and exit since this
c  is normally a fatal logic error.
c

c**
c**  Just polite code that does error checking.  Basically, if we got
c**  here something is wrong.  (And, probably, it is that the ISECTION_NUMBER
c**  variable is not being properly shared.  Notice that there is no section
c**  zero???  This was done purposely since if the ISECTION_NUMBER is not
c**  shared and the subprocesses wake up, they will see it as a local variable
c**  that happens to contain a zero.  You don't want the subprocesses thinking
c**  they have received a valid work request when they haven't...)
c**
c**  Also note that we're not writing this error message to SYS$OUTPUT since
c**  that was defined for the subprocesses as being the null device.
c**

	write (1,1400) my_num, isection_number
1400	format ('  ***** Error in subprocess number ',i2,'.',/,
	1	'        Specified part number = ',i)
	call exit

	end



	integer function last_routine

c**
c**  This is the bookmark routine to locate the end of the executable code
c**  that is to be shared.  Note, placing this routine here and not as a
c**  separate .OBJ at the end of the linker file list means that the
c**  parallel library routines will not be a part of the shared code.
c**  To get the maximum efficiency under normal circumstances (i.e. real
c**  code), the PLIB routines (along with any other user routines) should
c**  be also shared.
c**
c**  Only this routine's entry-point address (taken via %LOC in the main
c**  program) is ever used, but an INTEGER function must still define its
c**  result before returning - leaving it unassigned returns an undefined
c**  value.  Assign a fixed zero so the result is always well defined.
c**

	last_routine = 0
	return
	end
