\documentstyle[11pt,epic,psfig]{article}
\sloppy
\setlength{\textwidth}{6.5in}
\setlength{\textheight}{9in}
\setlength{\topmargin}{-0.5in}
\setlength{\oddsidemargin}{0pt}
\setlength{\evensidemargin}{0pt}

\pagestyle{plain} 

\newcommand{\pw}[1]{\ \mbox{\bf #1}\ } % "pw" stands for "program word"

\newcommand{\outline}[1]{%
\begin{center}
\fbox{ \begin{minipage}{415pt}
\vspace{1 ex}
#1
\end{minipage} }
\vspace{1 ex}
\end{center}
}

\begin{document} 

\bibliographystyle{plain}

\date{} 

\title{\Large \bf Parallel Adaptive Mesh Refinement and Redistribution on 
                   Distributed Memory Computers} 

\author{ C. {\"{O}}zturan,
H. L. deCougny,
M. S. Shephard,
J.E. Flaherty \\
 Scientific Computation Research Center  \\ 
 Rensselaer Polytechnic Institute \\ 
 Troy, NY 12181 
}
 
\maketitle 
\thispagestyle{empty} 

%\section{Introduction} 
%\section{Problem Statement}
%\section{Related Work}
%\section{Parallel Mesh Refinement}
%\section{Mesh Redistribution}
%\section{Results and Conclusion}


%\tableofcontents
%\clearpage

\centerline{\bf Abstract}
{\em
A procedure  to support parallel
refinement and redistribution of two dimensional unstructured 
finite element meshes on distributed memory computers is presented.
The procedure uses the mesh topological entity hierarchy
as the underlying data structures to easily support the required 
adjacency information.
Mesh refinement is done by employing links back to the geometric representation
to place new nodes on the boundary of the domain directly on the curved geometry.
The refined mesh is then redistributed by an iterative 
heuristic based  on the Leiss/Reddy~\cite{leiss} load balancing 
criteria. A fast  parallel tree edge-coloring  algorithm  
is used to pair processors having adjacent partitions and forming
a tree structure as a result of Leiss/Reddy load request criteria.
Excess elements are iteratively migrated from heavily loaded to 
less loaded processors until load balancing is achieved.
The system is implemented on a massively parallel MasPar MP-2 system with a
SIMD style of computation and uses
message passing primitives to migrate elements during the mesh redistribution
phase. Performance results of the redistribution
heuristics on various test meshes are given.
}

\section{Introduction}
\label{labintro}
  Adaptive finite element methods, driven by automatic estimation and
control of errors  have gained importance recently due to
their ability to offer reliable solutions to partial differential
equations~\cite{flaherty}.
An adaptive method starts with a solution on a 
coarse mesh using a low-order 
method and, based on an estimate of the global and 
local 
errors, either refines the mesh ({\it h-refinement}) and/or increases the order of
numerical solution ({\it p-refinement}).
 The sequential implementations of these methods have proved
to be very efficient by concentrating computations on regions of high
activity and  by providing exponential rates of convergence when proper
combinations of h- and p-refinement are employed.
 
The irregular and evolving behavior of the computational load in  adaptive strategies on
complex domains becomes problematic when parallel distributed-memory
machine implementations are considered. Complete parallelizations of
these methods necessitate additional and difficult stages  of
partitioning, parallel refinement  and the redistribution of the refined mesh.
Many heuristics have been devised to partition the initial unstructured
mesh and hence minimize the load imbalance and interprocessor communication
among processors. The redistribution of the refined mesh can also be
done by parallelizing similar partitioning heuristics. We note, however, that
global methods such as recursive mesh subdivision techniques 
which operate on the whole mesh do not take effective advantage of the
incremental  changes in the refined mesh. Changes in the 
mesh might not call for redistribution because the cost of
redistributing may be  more than the cost of  performing the computations
with a slightly imbalanced load. More importantly, the redistribution may involve only local
adjustments. Therefore the use of heuristics  that 
operate locally by migrating elements between the mapped
neighboring partitions is an attractive alternative.
 
This paper presents a procedure for parallel
refinement and redistribution of two dimensional finite element  meshes on 
distributed-memory computers.
 Section~\ref{labrelated}, reviews the currently
available partitioning and redistribution techniques and discusses the 
suitability of these techniques for adaptive techniques.
Sections~\ref{labmeshh}  and \ref{labhref} present
the data structures that are used to store the mesh, the adjacency links
between the processors and the specific geometric operators needed to  handle 
h-refinement on curved boundaries.
The refined mesh is redistributed by employing an iterative
heuristic based  on taking pairs of processors having adjacent partitions and
migrating elements from heavily loaded to less loaded processor.
We describe the redistribution algorithm in Section~\ref{labredis}. 
Finally, we report performance results of the redistribution
heuristic based on SIMD style of computation and message passing 
primitives on a 2048-processor  massively parallel MasPar MP-2 system.

\section{Related Work}
\label{labrelated}
  The current repertoire of partitioning and redistribution algorithms 
can be classified into three categories.
\begin{enumerate}
\item {\it Recursive Bisection (RB) Techniques} which repeatedly 
split the mesh into two-submeshes. {\it Coordinate RB} methods bisect the 
elements by their spatial coordinates. If the axis of bisection is 
Cartesian, then it is called {\it Orthogonal RB} \cite{berger2}. If the axes are chosen to
be along the principal axis of the moment of inertia matrix, then it is called
{\it Moment RB}. 
 {\it Spectral RB} is another  method which utilizes the properties of 
the {\it Laplacian matrix}~\cite{fiedler} of the mesh connectivity graph and bisects it
according to the eigenvector corresponding to the second smallest eigenvalue of this matrix~\cite{pothen}.
\item{\it Probabilistic Methods} which include simulated annealing and
genetic algorithms. These methods however require many iterations and are
expensive to use as mesh partitioning methods~\cite{williams}.
\item {\it Iterative Local Migration Techniques}  exchange load 
between neighboring processors to improve the load balance and/or decrease the
communication volume. 
The definition of processor neighborhood 
can either be the hardware link or the connectivity of the split 
domains.  The {\it Cyclic pairwise exchange} \cite{hammond} 
pairs processors connected by a hardware link and exchanges the 
nodes  of the mesh to improve the communication.  Leiss/Reddy on the 
other hand uses the hardware link as the neighborhood to transfer work from 
heavily 
loaded to less loaded processors. The {\it Tiling} algorithm 
\cite{devine}\cite{wheat}
extends the Leiss/Reddy algorithm to the case where the neighborhood 
is defined by the connectivity of the split domains. Lohner et al.~\cite{lohner}
algorithm exchange elements between subdomains
according to a deficit difference function which reflects the imbalance 
between an element and its neighbors.

\end{enumerate}

  Adaptive methods refine the mesh incrementally and hence require  periodic
redistribution of finite elements.   If used on the whole mesh, 
RB methods   require complete remapping of the  elements  and therefore 
have a fixed cost. 
 This introduces substantial overhead due to 
transfer of all elements to their new destinations. RB methods can 
handle the heterogenous load distributions on each element by assigning
weights to each element.  Recent efforts incorporate
the communication costs into the ORB method by considering the weights on the 
cut edges of the split domains~\cite{shadid}. RB methods
applied {\it globally} on the whole mesh  are too costly to be used 
repeatedly to redistribute a mesh dynamically with an adaptive technique.
They may however, offer their  advantages if applied {\it locally } to 
the incrementally altered mesh~\cite{berzins}.

  Iterative local migration techniques offer important advantages with
an adaptive technique. Since these methods perform 
local transfers, incremental changes in the mesh can be propagated to 
the processors to load balance and reduce communication
volume without solving an expensive global partitioning problem.
The elemental cost, which is the sum of the computational 
workload represented by the degrees of freedom (dofs) and the communication,
represented by the dofs on the partition boundary edges, 
can also be handled 
 by selecting 
elements according to a cost function for transfer. A disadvantage of 
iterative local migration techniques is that many iterations may be required
to regain global balance and hence 
elements reach their final 
destination after many local transfers 
rather than directly.  Assuming each iteration is fast 
the cost due to local transfer steps is amortized
by the smaller number of elements that are moved. 
Hence, they should be 
advantageous for use with adaptive techniques.

  The redistribution strategy given here is an iterative
local migration scheme. It is based on the Leiss/Reddy~\cite{leiss} algorithm 
and employs selection criteria similar to Wheat et al. \cite{wheat} in transferring 
elements. Unlike, these approaches, however, the processors are paired 
during load transfers similar to the pairwise exchange heuristic 
used by Hammond~\cite{hammond}. Our pairing procedure 
does not pair processors connected by 
hardware link in the static processor graph, but rather in the dynamically
changing graph representing the partitioned mesh (see Section~\ref{labredis}). 

\section{Mesh Data Structures}
\label{labmeshh}
  Parallel h- and p-refinement and dynamic redistribution 
algorithms 
for
the refined mesh
require various mesh adjacency and mesh entity classification information.
 The current procedure operates on the triangular unstructured meshes generated
by the shell capability of 
the {\it Finite Octree} procedure~\cite{shephard}. The data structure
used in this mesh generator is the complete mesh  topological
{\it entity hierarchy}~\cite{weil85} which provide a two-way link between the
mesh entities of consecutive order, i.e.,
 regions, faces, edges, and vertices.
Although this hierarchical data structure requires more memory than
classic finite element data structures (e.g., element-node relationship), it has
proven to be powerful especially in the context of refinement. It is
quite clear that h-refinement benefits from a complete hierarchy to delete
and create mesh entities efficiently. The presence of a complete
hierarchy is also very useful with the p-version of the finite element method,
since it easy to attach the edge and interior modes~\cite{szabobook} 
to the mesh entities. Additionally,
mesh entities are explicitly classified against the geometric model describing
the domain boundary.
In 
two dimensions, mesh faces are always classified as being in the interior of the 
domain, 
mesh edges 
are classified either in the  interior or on the boundary edge, and mesh vertices
are classified either in the interior, on boundary edge, or on a boundary vertex.
Classification guarantees that the mesh is still valid after 
h-refinement. Also, any mesh vertex classified on a boundary edge stores its
parametric coordinate on the corresponding boundary edge. Since generally 
large elements are used in the p-version, it is important to represent 
edges on curved boundaries accurately. The readily available classification
and the parametric coordinates make it easy to implement special 
boundary-element mapping techniques.  Figure~\ref{labfig1}(a) illustrates the 
relationship between the entities and the geometric model in the data
structures.

\begin{figure}[htb]
 \begin{center}
\makebox[6.375in][l]{
  \vbox to 2.208in{
    \vfill
    \special{psfile=fig1.ps}
  }
  \vspace{-\baselineskip}
}
  \end{center}
  \caption{ \sl (a) Mesh data structures (b) across processor links}
\label{labfig1}
\end{figure}

Because mesh entities are distributed
across processors corresponding to the current partitioning, additional
data has to be stored in order for a partition to have access to
information stored on neighboring partition. In two dimensions,
each mesh edge on a partition's boundary points to a corresponding
identical, duplicate mesh edge on the neighboring partition's boundary.
Storing links between these duplicate edges
 is sufficient to allow for inter-processor mesh entity access.
The additional information regarding inter-processor 
 vertex connectivities is obtained by sending messages and
following the edge links across processors.  Although this approach
introduces additional communication, it saves substantial  amount of memory by alleviating
the problem of storing variable length vertex connectivity lists.
Figure~\ref{labfig1}(b) shows an example of stored edge links across
partition boundaries.

\section{Parallel H-refinement} 
\label{labhref}
The h-refinement procedure is edge-based; thus 
each
mesh edge in the mesh is either marked for refinement or not. 
Duplicate mesh edges on partition boundaries must be 
marked identically. Each mesh face 
can have from zero to three marked mesh edges. For each possible
configuration, a template has been defined~\cite{WEBS93a} as shown in 
Figure~\ref{labfigref}: 
\begin{enumerate}
\item 1-edge: the triangle is divided into two (also known as a {\it green} 
   subdivision~\cite{BANK81}).
\item 2-edge: the triangle is divided into three. In this case, there is
   always a choice between two subdivisions. In order to limit element
   quality degradation, the cut along longest edge is performed
   first~\cite{RIVA84}.
\item 3-edge: the triangle is subdivided into four. 
   For simplicity, the subdivision that produces 4 triangles similar to
 the parent is chosen (also known as a {\it regular } subdivision \cite{BANK81}).
\end{enumerate}
  
Elements are subdivided by traversing the list of mesh faces
known by the processor in parallel. Once all mesh faces have been
visited, mesh edges resulting from refinement on a
partition boundary are linked to their identical duplicates on the neighboring
partition.  The refined parent entities are not deleted immediately and 
separate operators are provided for this purpose. 
In this way, information can be transferred from the parent
entities to their off-spring before the deletion.

\begin{figure}[htb]
\centerline{
\vbox{\psfig{file=fig2.ps,width=4.0in,height=1.3in}}
}
\caption{\sl Refinement templates.}
\label{labfigref}
\end{figure}

Any mesh vertex or edge resulting from the refinement of a mesh edge
inherits the classification of the parent edge.
With curved geometry, refinement vertices
that are classified on the model boundary are
snapped to the proper model entity. The parametric coordinate of
any refinement vertex classified on the model boundary is obtained as a 
simple average of the parametric coordinates of the two end vertices
of the parent edge. The location of the refinement vertex is then obtained
by querying the geometric modeler. It is possible that the location
of the refinement vertex induces negative jacobians for the surrounding
elements. In this case, one has to either: $(i)$ slightly move the location
of the refinement vertex, $(ii)$ remove the elements with negative jacobians,
and/or $(iii)$ add an additional vertex.

\section{Mesh Redistribution}
\label{labredis}

The  
redistribution algorithm and its similarities and differences with other 
implementations \cite{leiss}\cite{devine}\cite{wheat} are described using an example.
Consider the unbalanced mesh distribution over nine processors as shown
in Figure~\ref{labfig3}(a). Let $G_P(V,E)$ be  a {\it partition graph} with
each vertex in $V$ representing a partition assigned to a processor and $E$ representing the 
set of edges between partitions. Two partitions $u$ and $v$ are 
connected by an edge $(u,v) \in E$ if they share a mesh edge.
If two partitions share only a mesh vertex, then they are not considered
adjacent in the partition graph.   
The reason for the mesh edge connectivity requirement between
partitions in $G_P$ is twofold.  First, by excluding vertex 
adjacency, the number of edges in $E$ and, hence,
the time to communicate with  adjacent processors  is kept minimal. Second, 
transferring elements to a partition which share only a node
grows  a partition with two regions connected only by a single vertex.
This, in turn, results in a higher surface to volume ratio which increases
communication cost.
Figure~\ref{labfig3}(b) shows the partition graph obtained 
from the mesh distribution in 3(a).
 
\begin{figure}[htb]
 \begin{center}
\makebox[5.444in][l]{
  \vbox to 4.652in{
    \vfill
    \special{psfile=fig3.ps}
  }
  \vspace{-\baselineskip}
}
  \end{center}
  \caption{ \sl (a) Unbalanced load in each partition,
                (b) partition graph $P_G$,
                (c) load request,
                (d) load transfer between pairs in steps 0,1,2,3.}
\label{labfig3}
\end{figure}

Following Leiss/Reddy~\cite{leiss}, a 
workload deficient processor 
will request work from its most heavily loaded
neighbor. As a result, a processor can 
receive multiple requests but can only request load from one processor.
This pattern of requests produces a load hierarchy 
and forms a forest of trees $T_i$ as shown in Figure~\ref{labfig3}(c).
The  trees $T_i$ in the forest are subgraphs of the partition 
graph $G_P$. 

The current proposed algorithm for redistribution pairs 
the processors on each tree $T_i$ and transfers load from 
the heavily loaded pair to the other. The pairing of processors
is equivalent to coloring the edges of each tree $T_i$ with colors 
representing separate load transfer (communication)
cycles. The edge coloring approach synchronizes the load transfer between
neighboring processors  and differs from the  approach of Wheat et al.~\cite{wheat}.
 According to our implementation, processors
$P_2$ and $P_7$ in Figure~\ref{labfig3}(b) would be engaged with
load transfer with only one neighbor at a single transfer step. 
Wheat et al.~\cite{wheat}, however, would let processor $P_2$ and $P_7$
receive and send work during the same transfer step.  The latter approach
would be more efficient if there were load 
transfer in one direction only, i.e. from heavily loaded to less loaded 
processor.
Processors $P_2$ and $P_7$ would be doing useful work packing elements
to be transferred to their offspring while their parents pack elements
to be transferred to them.  In the current implementation, processors $P_2$ 
and $P_7$ would remain idle waiting for load transfer from parents and
would transfer load to offspring after the transfer from the parent has been 
completed.
However, this disadvantage is overshadowed by a number of 
advantages. Firstly, smaller number of messages and the 
synchronous transfer of load increase communication performance.
Second, since the processors are synchronized by pairs, 
a greater repertoire of selection criteria to decide which elements 
to transfer can be employed. Unlike~\cite{wheat}, where elements can be
transferred from only heavily load to less loaded processors, the pairing 
allows elements to be transferred from the less loaded to the heavily loaded
pair. This can be useful in improving the surface to volume ratio of 
the partitions. Since there is no explicit synchronization by 
edge coloring in~\cite{wheat}, this approach would make implementation of  
this bidirectional transfer of load extremely difficult. 

Figure~\ref{labfig3}(d) shows the coloring phase that we employ to pair the
processors. If $\Delta(G)$ denotes the maximum vertex degree (number 
of edges incident on a vertex) in a graph $G$, then 
Vizing's theorem~\cite{vizing} 
shows that the graph $G$ can be edge-colored using $C$ colors where 
$\Delta(G) \le C \le \Delta(G)+1 $.  For some special graphs including 
the trees 
the number of colors needed is exactly $\Delta(G)$. Therefore, 
$\Delta(T_i)$ colors are required to color the tree $T_i$. 
The main steps of the {\tt REDISTRIBUTE} algorithm are illustrated in
Figure~4.

\begin{figure}[htb]
\begin{center}
\begin{tabbing}
....\=....\=....\=....\=......\=......\=......\=......\=\=......\=......\=\kill
\underline{\pw{Algorithm REDISTRIBUTE:}} \\
\pw{Input:} \> \> \> \> An initial mesh distributed on the \\
            \> \> \> \> processors. \\
\pw{Output:} \> \> \> \> Redistributed mesh with reduced imbalance. \\
\>  \\
1. \> \pw{While}  not converged \pw{do}           \\
2. \> \> Compute neighboring load differences.       \\
3. \> \> Request load from neighbor processor having \\
   \> \> largest load difference. (creates processor tree T) \\
4. \> \> Determine amount of load to be sent or received. \\
5. \> \> Set-up Euler tour adjacency links for tree T.\\
6. \> \> Color the tree T by Euler touring.    \\
\> \\
7. \> \> \pw{for each} color $C$ \pw{do} \\
8. \> \> \> \pw{if} processor owns color $C$ \pw{and} is a neighbor \\
   \> \> \> \> of color $C$ pair processor \pw{then}\\
9. \> \> \> \> Transfer load between the pair processors. \\
   \> \> \pw{end for} \\
   \> \pw{end while} 
\end{tabbing}
\end{center}
\begin{tabular}{l}
\hspace{6.7in} \\ \hline 
\end{tabular}
\label{lbalgfig}
\caption{\sl Redistribution algorithm.} 
\end{figure}

\noindent
Before presenting the details of REDISTRIBUTE, a summary 
of the steps is given.
\begin{itemize}
\item Step $1$:  The steps of transferring work between 
                 paired processors is iterated until the load on each 
                processor converges to a value close to the optimal
                balance. 
                  The convergence of the Leiss/Reddy 
                  algorithm and the current  algorithm's convergence criteria 
                 is given in Section~\ref{labconvergence}.
\item Step $2$:   Load differences are computed by having each processor
                  send a load value to its neighbors and 
                  correspondingly receive load values from its neighbors.
                  This step takes $\Delta(G_P)$ time.
\item Step $3$:   The Leiss/Reddy~\cite{leiss} load request process is 
                  invoked 
                  and results in the forest of trees $T_i$.  The edges of 
                  $G_p$ is simply marked when a request has been
                  made indicating whether or not  it is a tree edge.
                  Since the incoming requests for load should be sorted, 
                  this step takes $O(\max_i\{d_i \cdot \log d_i \})$  time 
                  where $d_i=\Delta(T_i)$. 
\item Step $4$:   Deciding how much load to transfer to requesting 
                  processors is crucial in making the redistribution 
                  algorithm to converge. 
                  Criteria for this step are given with 
                  convergence criteria in Section~\ref{labconvergence}.
\item Steps $5-6$:   To facilitate efficient parallel scan operations
                   on the trees $T_i$, each tree is 
                  linearized by constructing an {\it Euler Tour} 
                  on it. 
                 The details of the Euler Tour construction can be 
                 found in~\cite{JaJa}. 
                 In the present implementation, each processor 
                  stores the links to its adjacent processors; hence, this 
                 list is traversed and local addresses 
                  sent  to the adjacent processors. The received
                  messages are sorted and the links
                  stored. 
                  Hence, constructing Euler Tour takes  
                 $O(\max_i\{d_i \cdot \log d_i \})$ time.  
                  Having constructed the 
                  Euler Tour, the tree is colored by employing scan 
                   operation on the linearized tree. The complexity
                   of coloring is 
                  $O(\max_i\{d_i \cdot \log |V_i| \})$ \footnote{ \parbox[t]{6.06in}{A recent
                  suggestion by Szymanski to use the depth first traversal
                  links as has been used in~\cite{szymansk} enables to 
                  get an $O(\max_i\{\log |V_i| \})$ for the parallel 
                  scanning step.}}
                   where $|V_i|$ denotes the number of vertices in tree $T_i$.
                  The details of tree edge coloring are given in 
                  Section~\ref{labcolor}.
                   
\item Steps $7-8$:  The steps of load transfer are synchronized by 
                 the edge coloring of the tree. One iteration 
                  of the redistribution algorithm involves $C$ steps
                  corresponding to the $\max_i(\Delta(T_i)) $ colors. Note
                  that this synchronization also allows for bi-directional
                  transfer of load between pair processors. 
\item Step $9$  selects elements to be transferred. A 
                 cost function is associated with each element to be transferred. 
                 This cost reflects the communication as well as the 
                 computational cost of the element.
                 The details of this 
                 procedure are given in Section~\ref{labelemmove}.

%\item Steps $9$:   Select elements to be transferred. We associate a 
%                    cost function with each element to be moved. 
%                   This cost reflects the communication as well as the 
%                   computational cost of the element.
%                   Complexity. We maintain a list of edges on the 
%                  partition boundaries.   We give the details of this 
%                  procedure in section~\ref{labelemmove}
\end{itemize}

\subsection{Fast Tree Edge-Coloring Algorithm to Pair Processors}
\label{labcolor}

   Given a tree  $T(V,E)$ with  vertices $V$ and edges $E$, an Euler Tour,
$T_{Euler}(V,E')$, can be  constructed by replacing each edge $(u,v) \in E$ 
by two directed arcs  $<u,v> \in E'$ and $<v,u> \in E'$  and setting up 
Euler Tour adjacency links.
   The directed graph $T_{Euler}(V,E')$ linearizes
the tree $T$ and allows efficient global scan operations to be done  by 
performing
parallel pointer jumping.  The edge coloring
of the tree is performed by assigning special weights to the arcs in 
$T_{Euler}$, scan summing these weights and taking modulo the maximum tree
vertex degree, $\Delta(T)$,  of the summed weights.  

   Let $parent(u)$ denote the parent vertex of $u$ in the rooted tree $T$,
$num\_child(u)$ denote the number of children of $v$,  and 
$child\_rank(u)$  denote the left to right rank of children of a parent
vertex. Hence, if $u$ is a parent vertex, then its leftmost child has 
$child\_rank=1$ and its rightmost child has $child\_rank=num\_child(u)$.  This
classifies the arcs of $T_{Euler}$ into three types, i.e.,  $<u,v>$ is:
\begin{itemize}
\item a {\it forward arc}  if $u=parent(v)$.
\item an {\it interior backward arc} if 
          $v=parent(u)$ and $child\_rank(u) < num\_child(v)$
\item a {\it last backward arc} if 
          $v=parent(u)$ and $child\_rank(u)= num\_child(v)$ 
\end{itemize}
   
 With these definitions, the tree edge-coloring algorithm is 
illustrated in Figure~5.

\begin{figure}[htb]
\begin{tabbing}
....\=....\=....\=....\=......\=......\=......\=......\=\=......\=......\=\kill
\underline{\pw{Tree Edge Coloring Algorithm:}} \\
\pw{Input:} \> \> \> \>  A tree $T$ with an Euler Tour $T_{Euler}$ defined on it. \\
\pw{Output:} \> \> \> \> Edge coloring of the tree on forward edges. \\
\\
1.  \> \> Assign weights to each arc $<u,v>$: \\
    \> \> \> $\bullet$ weight=1 if forward arc. \\
    \> \> \> $\bullet$ weight=0 if interior backward arc. \\
    \> \> \> $\bullet$ weight= $\Delta(T) - num\_child(v)$ if last backward arc. \\
2.  \> \> Scan sum weights by parallel pointer jumping. \\
3.  \> \> Take summed weight modulo $\Delta(T)$ on forward arcs. 
\end{tabbing}
\label{labfigcolr}
\caption{\sl Coloring Algorithm.}
\end{figure}
%\begin{tabular}{l}
%\hspace{6.7in} \\ \hline
%\end{tabular}
  
\par
 Step 1 of the above algorithm takes $O(\Delta(T))$ time since each processor has 
to traverse its list of neighbors to make weight assignment. 
The maximum number of neighbors 
is given by the maximum degree $\Delta(T)$.  Step 2 involves efficient
parallel scan operation. Scan operations take logarithmic time if each processor
stores only one item to be scanned. In the present case, each processor stores
$\Delta(T)$ items and hence this list must be traversed during each
step of the scanning. Hence step 2 takes $O(\Delta(T) \cdot \log |V|)^1$. 
The complexity of step 3 is the same as step 1.  Hence the overall 
coloring algorithm has $O(\Delta(T) \cdot \log |V|)^1$ parallel complexity for each tree.
Figure~\ref{labfig5} shows the coloring algorithm applied to one of the 
trees in the  load redistribution example. Figure~\ref{labfig5}(a) shows 
the weight assignment on the tree, (b) shows the three steps of the 
algorithm on the linearized tree and (c) shows the resulting coloring assignment
on the edges of the tree.

\begin{figure}[htb]
 \begin{center}
\makebox[6.694in][l]{
  \vbox to 4.833in{
    \vfill
    \special{psfile=fig6.ps}
  }
  \vspace{-\baselineskip}
}
  \end{center}
  \caption{ \sl Example (a) Weight assignment (b) Algorithm steps 
      (c) Edge coloring obtained. }
\label{labfig5}
\end{figure}

\subsubsection*{Correctness of the Algorithm}

An initial lemma is stated about the summation in a subtree of the weights 
assigned by 
the coloring algorithm and then this lemma is used to establish the correctness
of the coloring algorithm.

   Let set $E_f'$ consist of the forward edges,
$E_i'$ the interior backward edges and $E_l'$, the last backward edges.
Also let the non-leaf vertices of $T$ be given by 
$R(T) = \{ v | v \in V \hspace{0.1in} and\hspace{0.1in} num\_child(v) \ne 0 \}$.

\newtheorem{guess}{Lemma} 
\begin{guess}
 Given an Euler tour $T_{Euler}(V,E')$ of a rooted tree $T$, an 
integer constant $c \ge \Delta(T)$ and the assignment of weights 
to the arcs of $T_{Euler}$ 
  
\[
w(e) = \left\{ \begin{array}{ll} 
 $1$   & \mbox{if $e \in E_f' $ } \\
 $0$   & \mbox{if $e \in E_i' $} \\
$c-num\_child(v)$ & \mbox{if $e \in E_l' $}
        \end{array}
           \right.
\]

then 
\begin{equation}
\sum_{e \in E'} w(e) = c \cdot | R(T) | 
\end{equation}
\end{guess}

\noindent
{\it Proof:} 
   Simply summing the weights of all the edges gives: 

\begin{eqnarray*}
\sum_{e \in E'} w(e)  & = & 
\sum_{e \in E_f'} w(e) + \sum_{e \in E_i'} w(e) + \sum_{e \in E_l'} w(e) \\
                      & = & |V| - 1 + \sum_{e \in E_l'} (c - num\_child(v) ) \\
                      & = & |V| - 1 + c \cdot |R(T)| - \sum_{e \in E_l'} num\_child(v) \\
                      & = & c \cdot |R(T)| \\
\end{eqnarray*}
\hspace{6.4in}$\Box$

 We establish the correctness of the algorithm with the following theorem. \\
  \\
\noindent 
{\bf Theorem 1}
   The algorithm given in Figure~5 edge-colors a tree $T$
using $\Delta(T)$ colors. \\

\noindent
{\it Proof:}
   Note that each of
the edges incident on a vertex should have a different number assigned to it.
It suffices  to examine the non-leaf vertices. Let the summed weights
be denoted by $w'$. 
  Given  $u \in R(T)$, there are  forward arcs to the children 
$<u,v_i>$ $i=1,\ldots,num\_child(u)$. If  $u$ is not the root of the 
tree, the forward arc from the parent $<parent(u),u>$.  It remains to be shown
that  the summed weight modulo $\Delta$ is different for each of these
arcs.   First this is shown for the arcs $<u,v_i>$ $i=1,\ldots,num\_child(u)$. 

 Let the $w'(<v_{num\_child(u)},u>) = s $. Then the summed weights for 
children are given as:
\[
w'(u,v_i) = s + num\_child(u) - i + 1 + \sum_{j=i}^{num\_child(u)}\Delta(T) \cdot |R(T_i)| \hspace{0.5in} i=1,\ldots,num\_child(u)
\]

where $T_i$ denotes the subtree rooted by $v_i$ and the last term is 
obtained using the Lemma. It follows that each edge color is given by:

\[
color(<u,v_i>) = w'(<u,v_i>) \pw{mod} \Delta(T) = (s+ num\_child(u)-i +1) \pw{mod} \Delta(T) 
\]

 There are two cases: 1) $num\_child(u) = \Delta(T)$  which can occur if
$u$ is the root of $T$ and $degree(u)= \Delta(T)$,   and 
2) $num\_child(u) < \Delta(T)$. In the first case, there is no 
$<parent(u),u>$ arc, so $(s+num\_child(u)-i+1) \pw{mod} \Delta $  \hspace{0.1in} for $i=1,\ldots,\Delta $ is 
distinct for the forward arcs.  In the second case, $u$ can have a parent
or be the root in which case $(s+num\_child(u) -i+1) \pw{mod} \Delta $ 
\hspace{0.1in} for $1 \le i \le num\_child(u) < \Delta $ which 
is distinct for all $<u,v_i>$. If $u$ has a parent i.e. there is an arc
$<parent(u),u>$, then this edge gets $(s+num\_child(u)+1) \pw{mod} \Delta$
as the color which is distinct from the children's color. Thus 
since forward arcs dictate the color, then all the edges incident on a 
vertex have distinct colors. 

\subsection{Criteria for Selecting Elements to Transfer}
\label{labelemmove}
  
   Since the work per element can vary with adaptive methods, the 
selection criteria for deciding which elements to move becomes more 
complex.  
 Consider one of the element movement criteria proposed by Lohner 
\cite{lohner} as shown in Figure~\ref{labfig6}.  To prevent noisy
partition boundaries in (b), elements surrounding one of the vertices of
the boundary edge are transferred as shown in (c).  This approach however
may pose problems with p-refinement. Since the 
degrees of freedom on the edges and/or in the interior of an element can 
be increased by p-refinement, the work as well as the 
communication requirements becomes spatially nonuniform.
Hence, choosing elements around a boundary vertex   is 
not always be the best criteria. Figure~\ref{labfig6} (d) illustrates 
an example in which some of the elements have two degrees of 
freedom on their edges in addition to their nodal support.  
In this case 
selecting the elements which will cause noisy boundaries will be 
a better choice, because it decreases the communication volume.
Figures~\ref{labfig6}(e) and (f)  show the total
number of degrees of freedom on the partition boundary in each case.
\begin{figure}[htb]
 \begin{center}
\makebox[5.500in][l]{
  \vbox to 3.180in{
    \vfill
    \special{psfile=fig7.ps}
  }
  \vspace{-\baselineskip}
}
  \end{center}
  \caption{ \sl Lohner's[10] example and selection criteria (a)-(c). The
 case when varying number of degrees of freedom are present on the edges (d)-(f).}
\label{labfig6}
\end{figure}

 The most important criteria for element selection are:
\begin{enumerate}
\item Element selection should be done fast since it will be called 
      for each element to be moved. A traversal of the list
      of partition boundary edges to locate the next element to move is 
      required.
\item As the example of Figure~\ref{labfig6} illustrates,
      the number of degrees of freedom on the partition boundary should
       be kept minimal. This should be done to reduce communication
       volume.
\item Elements with a larger load should be favored since this 
      will imply fewer element transfers and faster 
      convergence to load balance.
\end{enumerate}


The current criteria for selecting elements is similar to the approach 
in Wheat et al.~\cite{wheat}
which prioritizes the boundary elements. 
Each partition boundary element has a cost associated with it based on:
\begin{enumerate}
\item  The element workload which is the total 
       number of degrees of freedom in the interior, on the edges and 
      the nodes of the element.
\item  The difference in communication cost that will result if the
       element is moved. This is equal to the total number of 
     degrees of freedom on the edges and the nodes that will be 
     exposed to the partition boundary.
\item The previous moves. This can be optionally used to favor
    selecting elements which are adjacent to a previously moved element.
\end{enumerate}


 The above three pieces of information are represented as a 
$3-tuple$  $(L_e,D_e,P_e)$ and used as a cost indicator when moving 
the elements.  The parameter $L_e$ refers to the workload on the element and it is 
the most significant part of the cost. 
The higher $L_e$, the higher the priority of the element to move.
The reason for assigning the heaviest weight to the  workload on 
the element is to facilitate greater reduction in the imbalance
between the processors when an element is moved.   If all the elements
have equal work, $L_e$ is set to unity.  In this case, 
the number of elements is balanced between the partitions. The parameter $D_e$ 
refers to the difference
in the communication cost and is the second most significant part.
The communication volume might decrease 
in which case $D_e>0$ or remain the same $D_e=0$ or increase
$D_e<0$ when the element is moved.  Finally, the least significant $P_e$ 
part can be used as a history mechanism for the previous moves.  
   When an element is  moved, the $P_e$ field of neighboring elements 
can be marked by $1$ so that
when the next element is to be moved, and both $L_e$ and $D_e$ are the 
same, the $P_e$ can be used to break ties. In this case, since all
parameters are equal, the marked $P_e$ will enable the element which
was the neighbor of a previously moved element to be favored.
Given this cost assignment, the next element to move is then chosen as the one
having the maximum $(L_e,D_e,P_e)$ cost.  We illustrate the 
use of 3-tuple cost template by applying it to the example mesh in 
Figure~\ref{labfig7}(a-c). 
For simplicity it is assumed that the number of edges
gives the communication cost and each element has a workload value of 1. 
As a result boundary elements 2, 4, and 6 
which are candidates for moving all  have cost $(1,-1,0)$.  In this case,
the load is $L_e=1$. The 
communication cost is given by $D_e=1-2$ which is the difference between the number 
of currently exposed edges to the number if the element were moved. 
Finally $P_e$ is set to $0$ indicating no neighbor has been moved yet. Since all
costs are equal, an element will be chosen arbitrarily, which is taken to be 
element 2.
The newly exposed elements 1 and 3 are inserted into the partition boundary 
element list and their costs calculated. The elements 1 and 3 also
have their $P_e$ marked as 1 since they are neighbor of a moved element.
This field is then used to favor their movement in case ties arise
in the other $L_e$ and $D_e$ costs.   This process is repeated until
all three elements have been moved.

\begin{figure}[htb]
 \begin{center}
\makebox[4.458in][l]{
  \vbox to 3.736in{
    \vfill
    \special{psfile=fig8.ps}
  }
  \vspace{-\baselineskip}
}
  \end{center}
  \caption{ \sl Example showing how the cost template is used to  move 
three elements.} 
\label{labfig7}
\end{figure}

\subsection{Determining Amount of Load to Send and Convergence}
\label{labconvergence}

  Suppose a parent processor with load value $L_0$  has
$m$ load requesting offspring with load values $L_i$ 
$i=1,\ldots,m$ as shown in Figure~\ref{labfig4}(a). 
The criteria for determining how much load to transfer is as follows:
 Each child requests an 
amount $r_i$ which is equal to the difference from its current load to 
the average of its and its parent's load, i.e.

\begin{equation}
   r_i = \lceil (L_0-L_i)/2 \rceil .
\label{eqreq}
\end{equation}

\noindent
The parent processor will decide to send a total amount which will make its load 
become the average of the loads $L_i$ $i=0,\ldots,m$ :

\[
  to\_send_{tot} = L_0 - \frac{\sum_{i=0}^m L_i}{m+1} .
\]

\noindent 
The parent determines the individual amounts $to\_send_i$ to transfer to 
children in proportion to the their load request: 

\[
 to\_send_i= min\{ r_i, to\_send_{tot} \cdot \frac{r_i}{\sum_{j=1}^{m} r_j} \}.
\]

\noindent 
 The minimum of the two values is taken in order to prevent transferring loads greater 
than the requested load. Figure~~\ref{labfig4}(b) shows the load 
requests and load grants for a subtree in the redistribution example.

\begin{figure}[htb]
 \begin{center}
\makebox[4.527in][l]{
  \vbox to 1.625in{
    \vfill
    \special{psfile=fig9.ps}
  }
  \vspace{-\baselineskip}
}
  \end{center}
\caption{ \sl (a) Load request $r_i=\lceil (L_0-L_i)/2 \rceil$ from sender (b) example of
transferred amounts}
\label{labfig4}
\end{figure}

   Two important issues that should be addressed in an iterative load balancing 
algorithm are:
\begin{itemize}
\item {\it Convergence:} As the load is transferred iteratively
                         between the processors, the load value on each 
                         processor should converge to the balanced average 
                         load in a finite number of steps. 
\item {\it Oscillations:} There should not be any indefinite cycles 
(repeating load transfer patterns) while the system is imbalanced.
\end{itemize}

 Leiss/Reddy~\cite{leiss} prove the following results in~\cite{leiss}. Let an
$H-neighborhood$ denote the neighbors of a processor within a distance
of $H$, $C$ denote the load treshold value. If elements are taken as 
load units, 
then $C=1$. Also let $d$ indicate the {\it diameter}
(the maximum of all the shortest paths between any two nodes) 
of the processor graph which in the present case is the partition graph. Finally,
define H-neighborhood imbalance at time $t$  as the variance
\[
   GIMB_H^t = \sum_{p \in P} (L(p) - \alpha)^2.
\]
Here $P$ is the processor set, $L(p)$, the load value on each processor and
$\alpha$ the average load value per processor.  The results proved are:

\begin{enumerate}
\item after a rebalancing iteration $GIMB_H^{t+1} < GIMB_H^{t}$ and 
\item after balancing terminates, the maximum imbalance in the whole 
      system is bounded by: $ \lceil \frac{d}{2H} \cdot C \rceil $.
\end{enumerate}

According to the first result, the imbalance in the 
{\it neighborhood} and {\it not} necessarily the whole system will reach a 
minimum since the $GIMB$ is decreased after each iteration. The second result
states that if the system is neighborhood-balanced, the whole system can 
still be severely imbalanced.  An example illustrating this worst case
scenario  is the configuration with $n$ processors forming a  1 dimensional 
chain and each having a load that differs from neighbor only by 
$L_{i+1} - L_i=1$, i.e. a load ramp. If $H=1$ and $C=1$ and since $d=n$, then 
the imbalance after termination of the algorithm will be $n/2$. 
Increasing the neighborhood measure $H$ to $n/2$ will balance the system
globally. However, $H=n/2$ will require {\it each} of the $n$ processors to 
send messages to the $n/2$ H-neighbors. Hence choosing 
$H=n/2$ is impractical. In general, the case  $H>1$ will increase the 
communication volume and hence make the iterative balancing algorithm
inefficient. 

  To avoid this problem with the Leiss/Reddy approach while keeping $H=1$,
two modifications are made to handle the case when the load difference between
the neighboring processors is $C$. Unlike~\cite{devine} which 
sends at most $\lfloor (L_0-L_i)/2 \rfloor $ and considers the 
$L_0 - L_i=1$ as balanced, the current procedure  exchanges the excess load as given in 
Eq~(\ref{eqreq}) even if the $GIMB$ will remain the same.
Hence, it allows the case when $GIMB_H^{t+1} \le GIMB_H^{t}$. 
The second modification involves, 
storing the previous move and making sure that the same excess load will not 
be transferred back to its original holder.  This is performed in 
order to prevent oscillations between two neighboring processors i.e.
oscillation on a cycle of length 2.  Even though the first modification
fixes the problem of premature termination without global load balancing, the
second modification does not take care of detecting oscillations which
can occur in cycles of length greater than 2. 

\section{Results}
 The iterative redistribution heuristic was tested on
the massively parallel MasPar MP-2 system with a torus connected
architecture and SIMD style of computation. 
Up to 2048 processors 
were used in the test cases with each processor having 64K bytes of memory. 
The Maspar system provides two types of communication mechanisms:
The restrictive {\it xnet} mechanism provides a fast communication  between 
the processors arranged in a mesh topology. The {\it router} provides
a slower general purpose communication among any pair of processors.
Since our applications involved highly unstructured meshes with arbitrary
curved boundaries, the communication requirements  between
mapped partitions were highly irregular. Hence the general router
communication was chosen in the implementation.

  Four test cases involving meshes on a square and an irregular curved boundary
were run.  Starting with a coarse mesh, {\it ORB} method~\cite{berger2}
 was used to get an initial partition sequentially. The partitioned 
mesh was then mapped onto the processors and refined selectively in 
parallel to create imbalanced processor loads. The square mesh 
was refined in one corner  to create a 'plateau' of high load distribution.
As the neighbouring load transfers progress, the plateau evolves
to the difficult redistribution example involving a ramp load distribution.
 Table~\ref{meshtest} 
shows various statistics for the test cases.  In square1, a small
mesh with 16 processors were employed. In square2 and square3 
2048 processors were employed with refinement in 4 and
16 processors respectively in the upper right corner of the mesh
as shown in Figure~11(a).  The final test involved a highly
unstructured mesh with a curved boundary (Figure~12).

{\small
\begin{table}[htbp]
\begin{center}
\begin{tabular}{|l|c|c|c|c|c|c|c|c|c|} \hline
     &  Number     &  Number    & Average           &
\multicolumn{2}{c|}{Load}  & 
\multicolumn{2}{c|}{Max boundary}  &
Number &  \\
     &  of  &  of   &  elements per  &   
\multicolumn{2}{c|}{ (Min,Max)} & 
\multicolumn{2}{c|}{ edges}  &
 of & Time \\
Test & elements   & processors  &  per processor     & before & after &  before 
& after & 
iterations & (secs) \\ \hline \hline 
square1 & 164       &  16        &  10.25          &  2, 32 & 7,11   & 
16 & 12 & 25 & 9.7 \\ \hline 
square2 &  32904    & 2048       &  16.06           &  16,52  & 16,18  & 
24 &  21 & 63 & 33.6 \\ \hline
square3 & 33300     & 2048       & 16.25            & 16,52  & 16,18  & 
24  & 25  & 398 & 214.9 \\ \hline
curved &  1008     & 32         &  31.5            & 18,47  & 31,32 &  
22 & 25 & 25 & 12.3 \\ \hline
\end{tabular}
\end{center}
\caption{Test cases and various statistics before and after convergence to 
          load balance}
\label{meshtest}
\end{table}
}

 
 Table~\ref{meshtest}, lists the average number of  
elements per processor, the maximum and minimum loads  
and the maximum number of edges located on
the boundary of the partitions before and after the  redistribution
algorithm has been run until convergence to optimal load balance.
Tests square1, square2 and curved shows good 
convergence results. Test square3, on the other hand, shows 
slow convergence even though the number of elements and the maximum
imbalance  is similar to the square2 test.  This slow
convergence can be explained as follows:
Since a  
ramp evolves during redistribution, the amount of load that can be 
transferred from the high load plateau to the less loaded processors is
small. In the worst case in which each processor arranged on a one dimensional ramp has a load
difference of one from its neighbour, the maximum number of elements 
that can be transferred per iteration is one. Hence, the bigger the 
excess load to be migrated from the plateau, the slower the convergence
of the redistribution algorithm. 
  The average load and the distance the elements have to travel
is approximately the same in both square2 and square3 test cases. Since
square3 has four times the excess load of square2, we would expect
the number of iterations of square3 to be around four times that
of square2. The convergence history shown in Figure~11 (b) and (c) point to 
this effect.
 
We also notice that the number of boundary edges which represents the volume of
communication associated with a partition does not necessarily decrease 
after the redistribution. Whereas tests square1 and square2 showed 
a slight reduction, square2 and curved meshes showed an increase in the number 
 of boundary edges after balancing.  Hence, even 
though the redistribution algorithm performs well in reducing the 
imbalance, the selection criteria used for element migration does not guarantee 
reduction of communication costs. 
   
\begin{figure}[htb]
\centerline{
\vbox{\psfig{file=fig10a.ps,width=1.5in,height=1.5in}}
\vbox{\psfig{file=blank.ps,width=0.2in,height=1.5in}}
\vbox{\psfig{file=fig10b.ps,width=1.5in,height=1.5in}}
\vbox{\psfig{file=blank.ps,width=0.2in,height=1.5in}}
\vbox{\psfig{file=fig10c.ps,width=2.5in,height=1.5in}}
}
\centerline{\hspace{1.12in} (a) \hspace{1.6in} (b) \hspace{2.0in}
 (c) \hspace{1.27in} }
\label{labtest1}
\caption{\sl Test square1: (a) unbalanced load after mesh refinement, (b) after
               redistribution and (c) convergence history.}
\end{figure}
 
\begin{figure}[htb]
\centerline{
\vbox{\psfig{file=fig11a.ps,width=1.5in,height=1.5in}}
\vbox{\psfig{file=blank.ps,width=0.2in,height=1.5in}}
\vbox{\psfig{file=fig11b.ps,width=2.0in,height=1.5in}}
\vbox{\psfig{file=blank.ps,width=0.2in,height=1.5in}}
\vbox{\psfig{file=fig11c.ps,width=2.0in,height=1.5in}}
}
\centerline{\hspace{1.12in} (a) \hspace{1.6in} (b) \hspace{2.0in}
 (c) \hspace{1.27in} }
\label{labtest2}
\caption{\sl (a) Test mesh for square2 and square3, (b) and (c) 
  corresponding convergence histories.}
\end{figure}
%%

\begin{figure}[htb]
\centerline{
\vbox{\psfig{file=fig12a.ps,width=1.2in,height=1.5in}}
\vbox{\psfig{file=blank.ps,width=0.15in,height=1.5in}}
\vbox{\psfig{file=fig12b.ps,width=1.2in,height=1.5in}}
\vbox{\psfig{file=blank.ps,width=0.15in,height=1.5in}}
\vbox{\psfig{file=fig12c.ps,width=1.2in,height=1.5in}}
\vbox{\psfig{file=blank.ps,width=0.15in,height=1.5in}}
\vbox{\psfig{file=fig12d.ps,width=2.1in,height=1.5in}}
}
\centerline{\hspace{0.60in} (a) \hspace{1.33in} (b) \hspace{1.30in}
 (c) \hspace{1.65in} (d) \hspace{0.9in} }
\label{labtest3}
\caption{\sl Test curved (a) initial ORB partitioned coarse mesh, 
(b) unbalanced mesh after refinement, 
(c) balanced mesh after redistribution and (d) convergence history.}
\end{figure}

  The convergence history plots given in 
Figures~10-12
show a sharp drop in the imbalance  within the first few iterations
and slower drops in later iterations. 
   Table~\ref{testtime} further present the performance of the 
heuristic by listing the number of iterations and the cpu time
required to achieve 50\%,75\% and 90\% reduction in the original 
imbalance.  Since higher percentage reductions require far more
iterations, a trade-off can be established between the time 
to do computation with an imbalanced load and the time needed 
to achieve further reductions in imbalance.  As a result,
the  redistribution process can be stopped at a smaller number of iterations. 

{\small
\begin{table}[htbp]
\begin{center}
\begin{tabular}{|c||c|c|c||c|c|c|} \hline
 & \multicolumn{6}{c|}{Percent Reduction in Imbalance}  \\ 
 & \multicolumn{3}{c}{Iterations} & \multicolumn{3}{c|}{Time(secs)} \\ 
Test  & 50 \% & 75 \% & 90 \% & 50 \% & 75 \% & 90 \% \\ \hline \hline
square1 & 4   & 5   & 10   & 5.2  & 5.7    & 7.6  \\ \hline
square2 & 4   & 11  & 30   & 7.1  & 12.7   & 21.9   \\ \hline
square3 & 13  & 29  & 98   & 28.6 & 47.7   & 92.8  \\ \hline
curved  & 2   & 4   & 9    & 4.4  &  6.4   & 8.9  \\ \hline
\end{tabular}
\end{center}
\caption{Iterations and cpu times to achieve various percent reductions 
         in imbalance.}
\label{testtime}
\end{table}
}


\section{Conclusion}
  This paper presented a procedure which performs parallel 
refinement and mesh redistribution  to be used for adaptive finite 
element environments.  The redistribution algorithm was based 
on the Leiss/Reddy heuristic and offered modifications to prevent possible
premature termination.  
A new tree edge coloring algorithm was presented and used to synchronize the 
load transfers between processors. The whole system was tested on various
meshes  and showed good convergence results. 

  There are however some open problems in the load balancing scheme
that has been used. First, the convergence {\it rate} for the Leiss/Reddy 
heuristic has not been established 
yet. This is particularly difficult to determine since
the partition graph changes dynamically.  Secondly, even though the 
 modifications offered resolve oscillation on cycles of length 2,
it is still not known how to efficiently detect oscillations on cycles
of length greater than 2. Some state information has to be 
stored to detect such oscillations. The tests with a ramp load 
distribution which was vulnerable for oscillation did converge to the 
average load. However, this does not ensure that the problem cannot arise.
Finally, the presented coloring algorithm 
makes it possible to use other element selection criteria to be used.
In particular, coloring of processors enables 
the recursive subdivision techniques  to be used 
locally on paired processors within the context of the Leiss/Reddy load 
balancing scheme. 

\section*{Acknowledgement}
This research was supported by the U.S. Army Research Office under 
Contract Number DAAL-03-91-G-0215 and the National Science Foundation
under Grant Number CCR 9216053.
The results presented in the paper were based on the computations performed
on the MasPar MP-2 system located at Ames Laboratory, Iowa State University.

{
\footnotesize

\begin{thebibliography}{10}

\bibitem{BANK81}
R.~E. Bank and A.~H. Sherman.
\newblock An adaptive multi-level method for elliptic boundary value problems.
\newblock {\em Computing}, 26:91--105, 1981.

\bibitem{berger2}
M.~J. Berger and S.~H. Bokhari.
\newblock A partitioning strategy for nonuniform problems on multiprocessors.
\newblock {\em IEEE Transactions on Computers}, C-36(5):570--580, May 1987.

\bibitem{shadid}
S.H. Bokhari, T.W. Crockett, and D.~M. Nicol.
\newblock Parametric binary dissection.
\newblock Technical Report ICASE 93-39, ICASE, NASA Langley Res. Ctr.,Hampton,
  July 1993.

\bibitem{devine}
K.D. Devine, J.E. Flaherty, S.~R. Wheat, and A.~B. Maccabe.
\newblock A massively parallel adaptive finite element method with dynamic load
  balancing.
\newblock Technical Report SAND 93-0936C, Sandia National Labs, Albuquerque,
  1993.

\bibitem{fiedler}
M.~Fiedler.
\newblock Algebraic connectivity of graphs.
\newblock {\em Czechoslovak Math. J.}, 23:298--305, 1973.

\bibitem{flaherty}
J.E. Flaherty, P.J. Paslow, M.S. Shephard, and J.D. Vasilakis, editors.
\newblock {\em Adaptive Methods for Partial Differential Equations}.
\newblock SIAM Proceedings Series, Philadelphia, 1989.

\bibitem{hammond}
S.~W. Hammond.
\newblock {\em Mapping Unstructured Grid Computations to Massively Parallel
  Computers}.
\newblock PhD thesis, Computer Science Dept., Rensselaer Polytechnic Institue,
  Troy, 1991.

\bibitem{JaJa}
J.~JaJa.
\newblock {\em An Introduction to Parallel Algorithms}.
\newblock Addison-Wesley Publishing Company, Reading, 1992.

\bibitem{leiss}
E.~Leiss and H.~Reddy.
\newblock Distributed load balancing: Design and performance analysis.
\newblock {\em W. M. Keck Research Computation Laboratory}, 5:205--270, 1989.

\bibitem{lohner}
R.~Lohner and R.~Ramamurti.
\newblock A parallelizable load balancing algorithm.
\newblock In {\em Proc. of the AIAA 31st Aerospace Sciences Meeting and
  Exhibit}, Reno, 1993.

\bibitem{pothen}
A.~Pothen, H.~Simon, and K.~Liou.
\newblock Partitioning sparse matrices with eigenvectors of graphs.
\newblock {\em SIAM J. Matrix Anal. Appl.}, 11(3):430--452, July 1990.

\bibitem{RIVA84}
Maria-Cecilia Rivara.
\newblock Algorithms for refining triangular grids suitable for adaptive and
  multigrid techniques.
\newblock {\em International Journal for Numerical Methods in Engineering},
  20:745--756, 1984.

\bibitem{shephard}
M.S. Shephard and M.K Georges.
\newblock Automatic three-dimensional mesh generation by the finite octree
  technique.
\newblock {\em Int. J. Numer. Meth. Engng}, 32:709--749, 1991.

\bibitem{szabobook}
B.A. Szabo and I.~Babuska.
\newblock {\em Introduction to Finite Element Analysis}.
\newblock John Wiley, New York, 1991.

\bibitem{szymansk}
B.~K. Szymanski and A.~Minczuk.
\newblock A representation of a distribution power network graph.
\newblock {\em Archiwum Elektrotechniki}, 27(2):367--380, 1978.

\bibitem{vizing}
V.G. Vizing.
\newblock On an estimate of a chromatic class of a multigraph.
\newblock In {\em Proc. Third Siberian Conf. on Mathematics and Mechanics},
  Tomsk, 1964.

\bibitem{berzins}
C.~Walshaw and M.~Berzins.
\newblock Dynamic load-balancing for pde solvers on adaptive unstructured
  meshes.
\newblock Technical Report Preprint, School of Computer Studies, University of
  Leeds, Leeds, 1992.

\bibitem{WEBS93a}
B.~E. Webster, M.~S. Shephard, and Z.~Rusak.
\newblock Unsteady compressible airfoil aerodynamics using an automated
  adaptive finite element method.
\newblock {\em AHS Journal}, Submitted, 1993.

\bibitem{weil85}
K.~J. Weiler.
\newblock Edge based data structures for solid modeling in curved-surface
  environments.
\newblock {\em IEEE Computer Graphics and Applications}, 5(2), 1985.

\bibitem{wheat}
S.~R. Wheat, K.~D. Devine, and A.~B. Maccabe.
\newblock Experience with automatic, dynamic load balancing and adaptive finite
  element computation.
\newblock Technical Report SAND 93-2172A, Sandia National Labs, Albuquerque,
  1993.

\bibitem{williams}
R.D. Williams.
\newblock Performance of dynamic load balancing algorithms for unstructured
  grid calculations.
\newblock Technical Report C3P913, Pasadena, 1990.

\end{thebibliography}
}
\clearpage
\listoffigures
\thispagestyle{empty}
\end{document} 

