This data structure (the "Problem" struct) stores the graph topology in CSR format and the frontier. All Problem structs inherit from the ProblemBase struct. Algorithm-specific data is stored in a "DataSlice".
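For readers new to CSR, here is a minimal sketch of the layout this struct consumes, using a hypothetical 4-vertex graph; the array names mirror the d_row_offsets/d_column_indices members used later, minus the device prefix.
// Hypothetical 4-vertex directed graph with edges 0->1, 0->2, 1->3, 2->3, 3->0:
int row_offsets[5]    = {0, 2, 3, 4, 5};  // row_offsets[v]..row_offsets[v+1] bound v's outgoing edges
int column_indices[5] = {1, 2, 3, 3, 0};  // destination vertex of each edge, grouped by source
// Vertex 0's neighbors are column_indices[0..2) = {1, 2}; vertex 3's are column_indices[4..5) = {0}.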
template<
typename VertexId,
typename SizeT,
typename Value,
bool _MARK_PREDECESSORS, // Whether to mark the predecessor ID during advance
bool _ENABLE_IDEMPOTENCE, // Whether to enable idempotent operations during advance
bool _USE_DOUBLE_BUFFER>
struct BFSProblem : public ProblemBase<VertexId, SizeT, _USE_DOUBLE_BUFFER>
{
When MARK_PREDECESSORS is set, the traversal records the predecessor node ID for each node placed in the new frontier.
static const bool MARK_PREDECESSORS = _MARK_PREDECESSORS;
ENABLE_IDEMPOTENCE is an optimization that applies when the operation performed in parallel on all neighboring nodes/edges is idempotent, so any data races are benign.
static const bool ENABLE_IDEMPOTENCE = _ENABLE_IDEMPOTENCE;
The DataSlice struct stores per-node or per-edge arrays and global variables (if any) that are specific to this particular algorithm. Here, we store the depth value and predecessor node ID for each node.
struct DataSlice
{
VertexId *d_labels; // BFS depth value
VertexId *d_preds; // Predecessor IDs
};
SizeT nodes;
SizeT edges;
DataSlice **data_slices; // one DataSlice per GPU (a single GPU here)
The constructor and destructor are omitted here.
"Extract" copies labels and predecessors back to the CPU.
cudaError_t Extract(VertexId *h_labels, VertexId *h_preds)
{
cudaError_t retval = cudaSuccess;
if (retval = util::GRError(CopyGPU2CPU(data_slices[0]->d_labels, h_labels, nodes))) return retval;
if (retval = util::GRError(CopyGPU2CPU(data_slices[0]->d_preds, h_preds, nodes))) return retval;
return retval;
}
The Init function initializes this Problem struct with a CSR graph that's stored on the CPU. It also initializes the algorithm-specific data, here depth and predecessor.
cudaError_t Init(
const Csr<VertexId, Value, SizeT> &graph)
{
cudaError_t retval = cudaSuccess;
if (retval = util::GRError(ProblemBase::Init(graph))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_labels, nodes))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_preds, nodes))) return retval;
return retval;
}
The Reset function primes the graph data structure to an untraversed state.
cudaError_t Reset(
const Csr<VertexId, Value, SizeT> &graph, VertexId src)
{
cudaError_t retval = cudaSuccess;
if (retval = util::GRError(ProblemBase::Reset(graph))) return retval;
Set all depth and predecessor values to invalid. Set the source node's depth value to 0.
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_labels, INVALID_NODE_VALUE, nodes);
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_preds, INVALID_PREDECESSOR_ID, nodes);
util::MemsetKernel<<<1, 1>>>(data_slices[0]->d_labels + src, 0, 1);
Put the source node ID into the initial frontier.
util::MemsetKernel<<<1, 1>>>(graph_slices[0]->ping_pong_working_queue[0], src, 1);
return retval;
}
};
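As a concrete check: running this Problem on the hypothetical 4-vertex graph sketched earlier, with src = 0 and MARK_PREDECESSORS enabled, Extract would return roughly the following.
h_labels = {0, 1, 1, 2};                       // BFS depth of vertices 0..3
h_preds  = {INVALID_PREDECESSOR_ID, 0, 0, 1};  // vertex 3's predecessor could be 1 or 2, depending on which edge wins the atomicCAS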
BFSFunctor defines the user-specified computations: (1) two per-edge functors, CondEdge and ApplyEdge, used by the Advance operator; and (2) two per-vertex functors, CondVertex and ApplyVertex, used by the Filter operator.
template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct BFSFunctor {
typedef typename ProblemData::DataSlice DataSlice;
static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *p)
{
if (ProblemData::MARK_PREDECESSORS)
Set predecessor for each destination node. We set the depth later, because we only want to set the depth for valid nodes.
return atomicCAS(&p->d_preds[d_id], INVALID_PREDECESSOR_ID, s_id) == INVALID_PREDECESSOR_ID;
else
If we're not keeping track of predecessors, we can immediately set the depth of the destination vertex to one plus the source vertex's depth.
return atomicCAS(&p->d_labels[d_id], INVALID_NODE_VALUE, p->d_labels[s_id] + 1) == INVALID_NODE_VALUE;
}
ApplyEdge performs the per-edge computation; for BFS it sets the destination's depth value.
static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *p)
{
if (ProblemData::MARK_PREDECESSORS)
We know the destination node is valid (from CondEdge), so here we set its depth to one plus the source vertex's depth.
p->d_labels[d_id] = p->d_labels[s_id]+1;
}
In BFS, CondVertex checks if the vertex is valid in the next frontier.
static __device__ __forceinline__ bool CondVertex(VertexId node, DataSlice *p)
{
return node != INVALID_NODE_ID;
}
In BFS, we don't apply any actions to vertices.
static __device__ __forceinline__ void ApplyVertex(VertexId node, DataSlice *p)
{
}
};
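To make the operator/functor contract concrete, here is a purely conceptual, sequential sketch of one Advance followed by one Filter using these functors. It assumes host-visible CSR arrays (row_offsets, column_indices), a host frontier in_frontier, a DataSlice pointer d_slice, and BFSFunctor standing for an instantiated functor type (like the BfsFunctor typedef in the enactor below, with <vector> included); the real functors are __device__ code driven by Gunrock's load-balanced kernels, so this loop structure is illustrative only.
std::vector<VertexId> expanded;                              // conceptual output of Advance
for (VertexId s_id : in_frontier)                            // Advance: visit each frontier vertex
    for (SizeT e = row_offsets[s_id]; e < row_offsets[s_id + 1]; ++e) {
        VertexId d_id = column_indices[e];
        if (BFSFunctor::CondEdge(s_id, d_id, d_slice)) {     // keep destinations that pass CondEdge
            BFSFunctor::ApplyEdge(s_id, d_id, d_slice);      // per-edge work (set depth/predecessor)
            expanded.push_back(d_id);
        } else {
            expanded.push_back(INVALID_NODE_ID);             // mark failed expansions as invalid
        }
    }
std::vector<VertexId> out_frontier;                          // conceptual output of Filter
for (VertexId node : expanded)                               // Filter: compact away invalid entries
    if (BFSFunctor::CondVertex(node, d_slice)) {
        BFSFunctor::ApplyVertex(node, d_slice);              // per-vertex work (none for BFS)
        out_frontier.push_back(node);
    }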
The enactor defines how a graph primitive runs: it sequences the traversal operators (Advance and Filter) and the computation (the functors).
class BFSEnactor : public EnactorBase {
For BFS, the constructor, destructor, and Setup function are omitted here.
template<
typename AdvancePolicy,
typename FilterPolicy,
typename BFSProblem>
cudaError_t EnactBFS(CudaContext &context, BFSProblem *problem, typename BFSProblem::VertexId src)
{
typedef BFSFunctor<
typename BFSProblem::VertexId,
typename BFSProblem::SizeT,
typename BFSProblem::VertexId,
BFSProblem> BfsFunctor;
Start with the Setup function to initialize kernel launch parameters.
cudaError_t retval = cudaSuccess;
if (retval = EnactorBase::Setup(problem)) return retval;
Define the graph topology data pointer (g_slice) and the problem-specific data pointer (d_slice)
typename BFSProblem::GraphSlice *g_slice = problem->graph_slices[0];
typename BFSProblem::DataSlice *d_slice = problem->data_slices[0];
Initialize the queue length (frontier size) to 1.
typename BFSProblem::SizeT queue_length = 1;
We ping-pong between two frontier buffers; "selector" picks which one holds the current input frontier.
int selector = 0;
Here we sequence our operators and functors. For BFS, we alternate between advancing to a new frontier (vertex-to-vertex), calling the BFS functor to set depths along the way, and then filtering out invalid nodes from the new frontier. We repeat until the frontier is empty.
while (queue_length > 0) {
gunrock::oprtr::advance::Kernel
<AdvancePolicy, BFSProblem, BFSFunctor>
<<<advance_grid_size, AdvancePolicy::THREADS>>>(
queue_length,
g_slice->ping_pong_working_queue[selector],
g_slice->ping_pong_working_queue[selector^1],
d_slice,
context,
This advance is vertex-to-vertex
gunrock::oprtr::advance::V2V);
selector ^= 1; // Swap selector
gunrock::oprtr::filter::Kernel
<FilterPolicy, BFSProblem, BFSFunctor>
<<<filter_grid_size, FilterPolicy::THREADS>>>(
queue_length,
g_slice->ping_pong_working_queue[selector],
g_slice->ping_pong_working_queue[selector^1],
d_slice);
Swap the selector again so the next advance reads the filtered frontier. In the full implementation, the enactor then retrieves the new frontier size into queue_length (bookkeeping omitted here); the loop ends when the frontier is empty.
selector ^= 1;
}
return retval;
}
The entry point in the driver code to BFS is this Enact call.
template <typename BFSProblem>
cudaError_t Enact(
CudaContext &context,
BFSProblem *problem, // Problem data sent in
typename BFSProblem::VertexId src) // Source node ID for BFS
{
Gunrock provides recommended settings here for kernel parameters, but they can be changed by end-users.
typedef gunrock::oprtr::filter::KernelPolicy<
BFSProblem,
300, //CUDA_ARCH
8, //MIN_CTA_OCCUPANCY
8> //LOG_THREAD_NUM
FilterKernelPolicy;
typedef gunrock::oprtr::advance::KernelPolicy<
BFSProblem,
300,//CUDA_ARCH
8, //MIN_CTA_OCCUPANCY
10, //LOG_THREAD_NUM
32*128> //THRESHOLD_TO_SWITCH_ADVANCE_MODE
AdvanceKernelPolicy;
return EnactBFS<AdvanceKernelPolicy, FilterKernelPolicy, BFSProblem>(context, problem, src);
}
};
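For orientation, a driver could wire these pieces together roughly as follows. This is only a hedged sketch: graph loading, error checking, and creation of the moderngpu CudaContext are assumed to happen elsewhere, and the template arguments are illustrative.
typedef BFSProblem<int, int, int,
    true,   // MARK_PREDECESSORS
    false,  // ENABLE_IDEMPOTENCE
    false>  // USE_DOUBLE_BUFFER
    Problem;
Csr<int, int, int> csr_graph;               // CSR graph assumed already loaded on the CPU
Problem problem;
BFSEnactor enactor;
int src = 0;                                // BFS source vertex
problem.Init(csr_graph);                    // copy topology to the GPU, allocate labels/preds
problem.Reset(csr_graph, src);              // invalidate labels/preds, seed the frontier with src
enactor.Enact(context, &problem, src);      // run the advance/filter iterations
int *h_labels = new int[problem.nodes];
int *h_preds  = new int[problem.nodes];
problem.Extract(h_labels, h_preds);         // copy BFS depths and predecessors back to the CPU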
Like BFS, SALSA stores the graph topology in CSR format and the frontier in its Problem struct, which inherits from ProblemBase; the SALSA-specific data lives in its own DataSlice.
template<
typename VertexId,
typename SizeT,
typename Value>
struct SALSAProblem : public ProblemBase
{
static const bool MARK_PREDECESSORS = true; // for SALSA algorithm, we need to track predecessors in Advance
static const bool ENABLE_IDEMPOTENCE = false; // In SALSA, data races during Advance are not allowed.
struct DataSlice
{
Value *d_hrank_curr; // hub rank score for current iteration
Value *d_arank_curr; // authority rank score for current iteration
Value *d_hrank_next; // hub rank score for next iteration
Value *d_arank_next; // authority rank score for next iteration
VertexId *d_in_degrees; // in degrees for each node
VertexId *d_out_degrees; // out degrees for each node
VertexId *d_hub_predecessors; // hub graph predecessors (original graph)
VertexId *d_auth_predecessors; // authority graph predecessors (reverse graph)
};
SizeT nodes; // number of nodes in the graph
SizeT edges; // number of edges in the graph
SizeT out_nodes; // number of nodes that have outgoing edges
SizeT in_nodes; // number of nodes that have incoming edges
DataSlice **data_slices; // one DataSlice per GPU (a single GPU here)
The constructor and destructor are omitted here.
"Extract" copies final hub rank scores and authority rank scores back to the CPU.
cudaError_t Extract(Value *h_hrank, Value *h_arank)
{
cudaError_t retval = cudaSuccess;
if (retval = util::GRError(CopyGPU2CPU(data_slices[0]->d_hrank_curr, h_hrank, nodes))) return retval;
if (retval = util::GRError(CopyGPU2CPU(data_slices[0]->d_arank_curr, h_arank, nodes))) return retval;
return retval;
}
SALSA is an algorithm for bipartite graphs, so the Init function takes two CSR graphs stored on the CPU: the hub (original) graph and the authority (reverse) graph. It uses them to initialize the graph topology data and the SALSA-specific DataSlice on the GPU.
cudaError_t Init(
const Csr<VertexId, Value, SizeT> &hub_graph,
const Csr<VertexId, Value, SizeT> &auth_graph)
{
cudaError_t retval = cudaSuccess;
if (retval = util::GRError(ProblemBase::Init(hub_graph, auth_graph))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_hrank_curr, nodes))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_arank_curr, nodes))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_hrank_next, nodes))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_arank_next, nodes))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_in_degrees, nodes))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_out_degrees, nodes))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_hub_predecessors, edges))) return retval;
if (retval = util::GRError(GPUMalloc(data_slices[0]->d_auth_predecessors, edges))) return retval;
return retval;
}
The Reset function primes the graph data structure to an initial state, which includes ...
cudaError_t Reset(const Csr<VertexId, Value, SizeT> &graph)
{
cudaError_t retval = cudaSuccess;
if (retval = util::GRError(ProblemBase::Reset(graph))) return retval;
... initializing the hub and authority rank scores ...
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_hrank_curr, (Value)1.0/out_nodes, nodes);
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_arank_curr, (Value)1.0/in_nodes, nodes);
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_hrank_next, 0, nodes);
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_arank_next, 0, nodes);
... computing the exact in- and out-degree of each node ...
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_out_degrees, 0, nodes);
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_in_degrees, 0, nodes);
util::MemsetMadVectorKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_out_degrees, ProblemBase::graph_slices[0]->d_row_offsets, &ProblemBase::graph_slices[0]->d_row_offsets[1], -1, nodes);
util::MemsetMadVectorKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_in_degrees, ProblemBase::graph_slices[0]->d_column_offsets, &ProblemBase::graph_slices[0]->d_column_offsets[1], -1, nodes);
... and initializing predecessors with an INVALID_PREDECESSOR_ID.
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_hub_predecessors, INVALID_PREDECESSOR_ID, edges);
util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_auth_predecessors, INVALID_PREDECESSOR_ID, edges);
return retval;
}
};
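Because Init consumes both the original and the reverse graph, the host side needs the transpose of the CSR structure (column_offsets plus row_indices, i.e., CSC). A minimal host-side sketch of building it from the row_offsets/column_indices arrays shown earlier follows; it is illustrative only, and Gunrock's own graph-setup code is not shown here.
int *column_offsets = new int[nodes + 1]();              // zero-initialized offsets for incoming edges
int *row_indices    = new int[edges];
for (int e = 0; e < edges; ++e)
    ++column_offsets[column_indices[e] + 1];             // count each destination's in-degree
for (int v = 0; v < nodes; ++v)
    column_offsets[v + 1] += column_offsets[v];          // prefix sum turns counts into offsets
int *cursor = new int[nodes];
for (int v = 0; v < nodes; ++v) cursor[v] = column_offsets[v];
for (int v = 0; v < nodes; ++v)
    for (int e = row_offsets[v]; e < row_offsets[v + 1]; ++e)
        row_indices[cursor[column_indices[e]]++] = v;    // record the source of each incoming edge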
Forward advance functors for the Hub nodes (original graph). Here, e_id_in is the incoming edge ID and e_id is the outgoing edge ID. This is essentially an initialization (setting all predecessors).
template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct HFORWARDFunctor
{
typedef typename ProblemData::DataSlice DataSlice;
static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
{
return true;
}
static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
{
For the hub graph (original graph), set each edge's source node ID
problem->d_hub_predecessors[e_id] = s_id;
}
};
Backward advance functors for the Hub nodes (original graph). The backward advance functors distribute ranks to nodes.
template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct HBACKWARDFunctor
{
typedef typename ProblemData::DataSlice DataSlice;
static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
{
Choose nodes with non-zero outgoing degrees ...
VertexId v_id = problem->d_hub_predecessors[e_id_in];
bool flag = (problem->d_out_degrees[v_id] != 0);
if (!flag) problem->d_hrank_next[v_id] = 0;
return flag;
}
static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
{
... then atomically update hub ranks.
Value hrank_dst = problem->d_hrank_curr[d_id] / (problem->d_in_degrees[s_id] * problem->d_out_degrees[d_id]);
VertexId v_id = problem->d_hub_predecessors[e_id_in];
atomicAdd(&problem->d_hrank_next[v_id], hrank_dst);
}
};
Forward advance functors for the Authority nodes (reverse graph). Like the Hub forward advance functor, this just sets all predecessors.
template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct AFORWARDFunctor
{
typedef typename ProblemData::DataSlice DataSlice;
static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
{
return true;
}
static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
{
For the authority graph (reverse graph), set each edge's source node ID.
problem->d_auth_predecessors[e_id] = s_id;
}
};
Backward advance functors for the Authority nodes (reverse graph). The backward advance functors distribute ranks to nodes.
template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct ABACKWARDFunctor
{
typedef typename ProblemData::DataSlice DataSlice;
static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
{
Choose nodes with non-zero incoming degrees ...
VertexId v_id = problem->d_auth_predecessors[e_id_in];
bool flag = (problem->d_in_degrees[v_id] != 0);
if (!flag) problem->d_arank_next[v_id] = 0;
return flag;
}
static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
{
... then atomically update authority ranks.
Value arank_dst = problem->d_arank_curr[d_id] / (problem->d_out_degrees[s_id] * problem->d_in_degrees[d_id]);
VertexId v_id = problem->d_auth_predecessors[e_id_in];
atomicAdd(&problem->d_arank_next[v_id], arank_dst);
}
};
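Reading the two backward functors together clarifies what one rank iteration computes. For every traversed edge with endpoints s_id and d_id, the rank of the edge's recorded predecessor v is accumulated as, in the notation of the code above:
d_hrank_next[v] += d_hrank_curr[d_id] / (d_in_degrees[s_id] * d_out_degrees[d_id]);   // v = d_hub_predecessors[e_id_in]
d_arank_next[v] += d_arank_curr[d_id] / (d_out_degrees[s_id] * d_in_degrees[d_id]);   // v = d_auth_predecessors[e_id_in]
Each update divides the current rank by one degree per hop, which corresponds to SALSA's two-step random walk; the atomicAdd resolves the case where several edges contribute to the same v in parallel.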
As with BFS, the enactor defines how the graph primitive runs, sequencing the traversal operators and the computation functors.
class SalsaEnactor : public EnactorBase {
For SALSA, the constructor, destructor, and Setup function are omitted here.
This user-defined helper copies the next rank values into the current rank arrays, then zeroes the next arrays for the following iteration.
template <typename ProblemData>
void SwapRank(ProblemData *problem, int is_hub, int nodes)
{
typedef typename ProblemData::Value Value;
Value *rank_curr;
Value *rank_next;
if (is_hub) {
rank_curr = problem->data_slices[0]->d_hrank_curr;
rank_next = problem->data_slices[0]->d_hrank_next;
} else {
rank_curr = problem->data_slices[0]->d_arank_curr;
rank_next = problem->data_slices[0]->d_arank_next;
}
Copy next into curr, then reset next.
util::MemsetCopyVectorKernel<<<BLOCK, THREAD>>>(rank_curr, rank_next, nodes);
util::MemsetKernel<<<BLOCK, THREAD>>>(rank_next, (Value)0.0, nodes);
}
This enactor defines the SALSA high-level algorithm.
template<
typename AdvancePolicy,
typename FilterPolicy,
typename SALSAProblem>
cudaError_t EnactSALSA(
CudaContext &context,
SALSAProblem *problem,
int max_iteration) {
typedef typename SALSAProblem::VertexId VertexId;
typedef typename SALSAProblem::SizeT SizeT;
typedef typename SALSAProblem::Value Value;
Define SALSA functors.
typedef HFORWARDFunctor<
VertexId,
SizeT,
Value,
SALSAProblem> HForwardFunctor;
typedef AFORWARDFunctor<
VertexId,
SizeT,
Value,
SALSAProblem> AForwardFunctor;
typedef HBACKWARDFunctor<
VertexId,
SizeT,
Value,
SALSAProblem> HBackwardFunctor;
typedef ABACKWARDFunctor<
VertexId,
SizeT,
Value,
SALSAProblem> ABackwardFunctor;
Start with the Setup function to initialize kernel launch parameters.
cudaError_t retval = cudaSuccess;
if (retval = EnactorBase::Setup(problem)) return retval;
Define the graph topology data pointer (g_slice) and the problem-specific data pointer (d_slice).
typename SALSAProblem::GraphSlice *g_slice = problem->graph_slices[0];
typename SALSAProblem::DataSlice *d_slice = problem->data_slices[0];
Now let's do some computation.
SizeT queue_length = g_slice->nodes;
int selector = 0;
{
First we'll do some initialization code that runs just once. Start by initializing the frontier with all node IDs.
util::MemsetIdxKernel<<<BLOCK, THREAD>>>(g_slice->ping_pong_working_queue[selector], g_slice->nodes);
Set predecessor nodes for each edge in the original graph.
gunrock::oprtr::advance::Kernel
<AdvancePolicy, SALSAProblem, HForwardFunctor>
<<<advance_grid_size, AdvancePolicy::THREADS>>>(
queue_length,
g_slice->ping_pong_working_queue[selector],
g_slice->ping_pong_working_queue[selector^1],
g_slice->d_row_offsets,
g_slice->d_column_indices, //advance on original graph
d_slice,
context,
gunrock::oprtr::advance::V2E);
And set the predecessor nodes for each edge in the reverse graph.
gunrock::oprtr::advance::Kernel
<AdvancePolicy, SALSAProblem, AForwardFunctor>
<<<advance_grid_size, AdvancePolicy::THREADS>>>(
queue_length,
g_slice->ping_pong_working_queue[selector],
g_slice->ping_pong_working_queue[selector^1],
g_slice->d_column_offsets,
g_slice->d_row_indices, //advance on reverse graph
d_slice,
context,
gunrock::oprtr::advance::V2E);
}
Now we iterate between two Advance operators, which update (1) the hub rank and (2) the authority rank. We loop until we've reached the maximum iteration count.
int iteration = 0;
while (true) {
util::MemsetIdxKernel<<<BLOCK, THREAD>>>(g_slice->ping_pong_working_queue[selector], g_slice->edges);
queue_length = g_slice->edges;
This Advance operator updates the hub rank ...
gunrock::oprtr::advance::Kernel
<AdvancePolicy, SALSAProblem, HBackwardFunctor>
<<<advance_grid_size, AdvancePolicy::THREADS>>>(
queue_length,
g_slice->ping_pong_working_queue[selector],
g_slice->ping_pong_working_queue[selector^1],
g_slice->d_column_offsets,
g_slice->d_row_indices, //advance backward on reverse graph
d_slice,
context,
gunrock::oprtr::advance::E2V);
SwapRank<SALSAProblem>(problem, 1, g_slice->nodes); // 1 selects the hub ranks just computed
... and this one updates the authority rank.
gunrock::oprtr::advance::Kernel
<AdvancePolicy, SALSAProblem, ABackwardFunctor>
<<<advance_grid_size, AdvancePolicy::THREADS>>>(
queue_length,
g_slice->ping_pong_working_queue[selector],
g_slice->ping_pong_working_queue[selector^1],
g_slice->d_row_offsets,
g_slice->d_column_indices, //advance backward on original graph
d_slice,
context,
gunrock::oprtr::advance::E2V);
SwapRank<SALSAProblem>(problem, 0, g_slice->nodes);
iteration++;
if (iteration >= max_iteration) break;
}
return retval;
}
The entry point in the driver code to SALSA is this Enact call.
template <typename SALSAProblem>
cudaError_t Enact(
CudaContext &context,
SALSAProblem *problem,
typename SALSAProblem::SizeT max_iteration)
{
Gunrock provides recommended settings here for kernel parameters, but they can be changed by end-users.
typedef gunrock::oprtr::filter::KernelPolicy<
SALSAProblem,
300, //CUDA_ARCH
8, //MIN_CTA_OCCUPANCY
8> //LOG_THREAD_NUM
FilterKernelPolicy;
typedef gunrock::oprtr::advance::KernelPolicy<
SALSAProblem,
300, //CUDA_ARCH
8, //MIN_CTA_OCCUPANCY
10, //LOG_THREAD_NUM
32*128> //THRESHOLD_TO_SWITCH_ADVANCE_MODE
AdvanceKernelPolicy;
return EnactSALSA<AdvanceKernelPolicy, FilterKernelPolicy, SALSAProblem>(
context, problem, max_iteration);
}
};
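As with BFS, a driver would wire the SALSA pieces together roughly as follows. This is a hedged sketch with illustrative type parameters and iteration count; graph loading, the reverse-graph construction, error checks, and the CudaContext creation are assumed to happen elsewhere.
typedef SALSAProblem<int, int, float> Problem;    // VertexId, SizeT, Value
Csr<int, float, int> hub_graph, auth_graph;       // original and reverse CSR graphs, loaded on the CPU
Problem problem;
SalsaEnactor enactor;
int max_iteration = 20;                           // illustrative iteration count
problem.Init(hub_graph, auth_graph);              // copy both topologies to the GPU, allocate rank arrays
problem.Reset(hub_graph);                         // initialize ranks, degrees, and predecessors
enactor.Enact(context, &problem, max_iteration);  // run the forward setup and the backward rank iterations
float *h_hrank = new float[problem.nodes];
float *h_arank = new float[problem.nodes];
problem.Extract(h_hrank, h_arank);                // copy hub and authority scores back to the CPU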