all annotated code

#

This data structure (the "Problem" struct) stores the graph topology in CSR format and the frontier. All Problem structs inherit from the ProblemBase struct. Algorithm-specific data is stored in a "DataSlice".

template<
         typename VertexId,
         typename SizeT,
         typename Value,
         bool _MARK_PREDECESSORS, // Whether to mark predecessor ID when advance
         bool _ENABLE_IDEMPOTENCE, // Whether to enable idempotence when advance
         bool _USE_DOUBLE_BUFFER>
         struct BFSProblem : public ProblemBase<VertexId, SizeT, _USE_DOUBLE_BUFFER>
{
#

MARK_PREDECESSORS sets the predecessor node ID during a traversal for each node in the new frontier.

    static const bool MARK_PREDECESSORS     = _MARK_PREDECESSORS;
#

ENABLE_IDEMPOTENCE is an optimization when the operation performed in parallel for all neighbor nodes/edges is idempotent, meaning data races are benign.

    static const bool ENABLE_IDEMPOTENCE    = _ENABLE_IDEMPOTENCE;
#

The DataSlice struct stores per-node or per-edge arrays and global variables (if any) that are specific to this particular algorithm. Here, we store the depth value and predecessor node ID for each node.

    struct DataSlice
    {
        VertexId *d_labels; // BFS depth value
        VertexId *d_preds; // Predecessor IDs
    };

    SizeT nodes;
    SizeT edges;
    DataSlice *d_data_slices;
#

The constructor and destructor are ignored here.

#

"Extract" copies labels and predecessors back to the CPU.

    cudaError_t Extract(VertexId *h_labels, VertexId *h_preds)
    {
        cudaError_t retval = cudaSuccess;
        if (retval = util::GRError(CopyGPU2CPU(data_slices[0]->d_labels, h_labels, nodes))) break;
        if (retval = util::GRError(CopyGPU2CPU(data_slices[0]->d_preds, h_preds, nodes))) break;
        return retval;
    }
#

The Init function initializes this Problem struct with a CSR graph that's stored on the CPU. It also initializes the algorithm-specific data, here depth and predecessor.

    cudaError_t Init(
            const Csr<VertexId, Value, SizeT> &graph)
    {
        cudaError_t retval = cudaSuccess;
        if (retval = util::GRError(ProblemBase::Init(graph))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_labels, nodes))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_preds, nodes))) break;
        return retval;
    }
#

The Reset function primes the graph data structure to an untraversed state.

    cudaError_t Reset(
            const Csr<VertexId, Value, SizeT> &graph, VertexId src)
    {
        cudaError_t retval = cudaSuccess;
        if (retval = util::GRError(ProblemBase::Reset(graph))) break;
#

Set all depth and predecessor values to invalid. Set the source node's depth value to 0.

        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_labels, INVALID_NODE_VALUE, nodes);
        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_preds, INVALID_PREDECESSOR_ID, nodes);
        if (retval = util::GRError(CopyGPU2CPU(data_slices[0]->d_labels+src, 0, 1)));
#

Put the source node ID into the initial frontier.

        if (retval = util::GRError(CopyGPU2CPU(g_slices[0]->ping_pong_working_queue, src, 1)));
        return retval;
    }
};

bfs_functor.cuh

#

bfs_functor defines user-specific computations with (1) two per-edge functors, CondEdge and ApplyEdge, which will be used in the Advance operator; and (2) two per-node functors, CondVertex and ApplyVertex, which will be used in the Filter operator.

template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct BFSFunctor {
    typedef typename ProblemData::DataSlice DataSlice;

    __device__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *p)
    {
        if (ProblemData::MARK_PREDECESSORS)
#

Set predecessor for each destination node. We set the depth later, because we only want to set the depth for valid nodes.

            return (atomicCAS(&p->d_preds[d_id], INVALID_PREDECESSOR_ID, s_id) == INVALID_PREDECESSOR_ID)
                ? true : false;
        else
#

If we're not keeping track of predecessors, we can immediately set the depth of the destination vertex to one plus the source vertex's depth.

            return (atomicCAS(&p->d_labels[d_id], INVALID_NODE_VALUE, s_id+1) == INVALID_NODE_VALUE)
                ? true : false;
    }
#

ApplyEdge here increments the depth value.

    __device__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *p)
    {
        if (ProblemData::MARK_PREDECESSORS)
#

We know the destination node is valid (from CondEdge), so here we set its depth to one plus the source vertex's depth.

            p->d_labels[d_id] = p->d_labels[s_id]+1;
    }
#

In BFS, CondVertex checks if the vertex is valid in the next frontier.

    __device__ void CondVertex(VertexId node, DataSlice *p)
    {
        return node != INVALID_NODE_ID;
    }
#

In BFS, we don't apply any actions to vertices.

    __device__ void ApplyVertex(VertexId node, DataSlice *p)
    {
    }
};

bfs_enactor.cuh

#

The enactor defines how a graph primitive runs. It calls traversal (advance and filter operators) and computation (functors).

#
class BFSEnactor : public EnactorBase {
#

For BFS, Constructor, Destructor, and Setup functions are ignored

    template<
    typename AdvancePolicy,
    typename FilterPolicy,
    typename BFSProblem>
    cudaError_t EnactBFS(CudaContext &context,BFSProblem *problem,VertexId   src)
    {

        typedef BFSFunctor<
        typename BFSProblem::VertexId,
        typename BFSProblem::SizeT,
        typename BFSProblem::VertexId,
        BFSProblem> BfsFunctor;
#

Start with the Setup function to initialize kernel running parameters

        cudaError_t retval = cudaSuccess;
        if (retval = EnactorBase::Setup(problem)) break;
#

Define the graph topology data pointer (g_slice) and the problem-specific data pointer (d_slice)

        typename BFSProblem::GraphSlice *g_slice = problem->d_graph_slices;
        typename BFSProblem::DataSlice *d_slice = problem->d_data_slices;
#

Initialize the queue length (frontier size) to 1.

        SizeT queue_length = 1;
#

We ping-pong between old and new frontiers; "selector" picks which one is the current destination.

        int selector = 0;
#

Here we sequence our operators and functors. For BFS, we alternate between advancing to a new frontier (vertex-to-vertex), calling the BFS functor to set depths along the way, and then filtering out invalid nodes from the new frontier. We repeat until the frontier is empty.

        while (queue_length > 0) {
            gunrock::oprtr::advance::Kernel
                <AdvancePolicy, BFSProblem, BFSFunctor>
                <<<advance_grid_size, AdvancePolicy::THREADS>>>(
                        queue_length,
                        g_slice->ping_pong_working_queue[selector],
                        g_slice->ping_pong_working_queue[selector^1],
                        d_slice,
                        context,
#

This advance is vertex-to-vertex

                        gunrock::oprtr::advance::V2V);

            selector ^= 1; // Swap selector

            gunrock::oprtr::filter::Kernel
                <FilterPolicy, BFSProblem, BFSFunctor>
                <<<filter_grid_size, FilterPolicy::THREADS>>>(
                        queue_length,
                        g_slice->ping_pong_working_queue[selector],
                        g_slice->ping_pong_working_queue[selector^1],
                        d_slice);
        }
        return retval;
    }
#

The entry point in the driver code to BFS is this Enact call.

    template <typename BFSProblem>
        cudaError_t Enact(
                CudaContext &context,
                BFSProblem *problem, // Problem data sent in
                typename BFSProblem::VertexId src) // Source node ID for BFS
        {
#

Gunrock provides recommended settings here for kernel parameters, but they can be changed by end-users.

            typedef gunrock::oprtr::filter::KernelPolicy<
                BFSProblem,
                300, //CUDA_ARCH
                8, //MIN_CTA_OCCUPANCY
                8> //LOG_THREAD_NUM
                FilterKernelPolicy;

            typedef gunrock::oprtr::advance::KernelPolicy<
                BFSProblem,
                300,//CUDA_ARCH
                8, //MIN_CTA_OCCUPANCY
                10, //LOG_THREAD_NUM
                32*128> //THRESHOLD_TO_SWITCH_ADVANCE_MODE
                AdvanceKernelPolicy;

            return EnactBFS<AdvanceKernelPolicy, FilterKernelPolicy, BFSProblem>(context, problem, src);
        }
};

salsa_problem.cuh

#

This data structure (the "Problem" struct) stores the graph topology in CSR format and the frontier. All Problem structs inherit from the ProblemBase struct. Algorithm-specific data is stored in a "DataSlice".

template<
typename VertexId,
typename SizeT,
typename Value>
struct SALSAProblem : public ProblemBase
{

    static const bool MARK_PREDECESSORS     = true; // for SALSA algorithm, we need to track predecessors in Advance
    static const bool ENABLE_IDEMPOTENCE    = false; // In SALSA, data races during Advance are not allowed.

    struct DataSlice
    {
        Value *d_hrank_curr; // hub rank score for current iteration
        Value *d_arank_curr; // authority rank score for current iteration
        Value *d_hrank_next; // hub rank score for next iteration
        Value *d_arank_next; // authority rank score for next iteration
        VertexId *d_in_degrees; // in degrees for each node
        VertexId *d_out_degrees; // out degrees for each node
        VertexId *d_hub_predecessors; // hub graph predecessors (original graph)
        VertexId *d_auth_predecessors; // authority graph predecessors (reverse graph)
    };

    SizeT nodes; // node number of the graph
    SizeT edges; // edge number of the graph
    SizeT out_nodes; // number of nodes that have outgoing edges
    SizeT in_nodes; // number of nodes that have incoming edges
    DataSlice *d_data_slices;
#

The constructor and destructor are ignored here.

#

"Extract" copies final hub rank scores and authority rank scores back to the CPU.

    cudaError_t Extract(VertexId *h_hrank, VertexId *h_arank)
    {
        cudaError_t retval = cudaSuccess;
        if (retval = util::GRError(CopyGPU2CPU(data_slices[0]->d_hrank_curr, h_hrank, nodes))) break;
        if (retval = util::GRError(CopyGPU2CPU(data_slices[0]->d_arank_curr, h_arank, nodes))) break;
        return retval;
    }
#

SALSA is an algorithm for bipartite graphs. So the Init function takes two CSR graphs---the hub (original) graph and the auth (reverse) graph stored on the CPU, and uses them to initialize the graph topology data and the SALSA-specific DataSlice on the GPU.

    cudaError_t Init(
            const Csr<VertexId, Value, SizeT> &hub_graph,
            const Csr<VertexId, Value, SizeT> &auth_graph)
    {
        cudaError_t retval = cudaSuccess;
        if (retval = util::GRError(ProblemBase::Init(hub_graph, auth_graph))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_hrank_curr, nodes))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_arank_curr, nodes))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_hrank_next, nodes))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_arank_next, nodes))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_in_degrees, nodes))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_out_degrees, nodes))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_hub_predecessors, edges))) break;
        if (retval = util::GRError(GPUMalloc(data_slices[0]->d_auth_predecessors, edges))) break;
        return retval;
    }
#

The Reset function primes the graph data structure to an initial state, which includes ...

    cudaError_t Reset(const Csr<VertexId, Value, SizeT> &graph)
    {
        cudaError_t retval = cudaSuccess;
        if (retval = util::GRError(ProblemBase::Reset(graph))) break;
#

... initializing the hub and authority rank scores ...

        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_hrank_curr, (Value)1.0/out_nodes, nodes);
        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_arank_curr, (Value)1.0/in_nodes, nodes);
        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_hrank_next, 0, nodes);
        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_arank_next, 0, nodes);
#

... accurate in and out degrees for each node ...

        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_out_degrees, 0, nodes);
        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_in_degrees, 0, nodes);
        util::MemsetMadVectorKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_out_degrees, BaseProblem::graph_slices[gpu]->d_row_offsets, &BaseProblem::graph_slices[gpu]->d_row_offsets[1], -1, nodes);
        util::MemsetMadVectorKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_in_degrees, BaseProblem::graph_slices[gpu]->d_column_offsets, &BaseProblem::graph_slices[gpu]->d_column_offsets[1], -1, nodes);
#

... and initializing predecessors with an INVALID_PREDECESSOR_ID.

        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_hub_predecessors, INVALID_PREDECESSOR_ID, edges);
        util::MemsetKernel<<<BLOCK, THREAD>>>(data_slices[0]->d_auth_predecessors, INVALID_PREDECESSOR_ID, edges);

        return retval;
    }
};

salsa_functor.cuh

#

Forward advance functors for the Hub nodes (original graph). Here, e_id_in is the incoming edge ID and e_id is the outgoing edge ID. This is essentially an initialization (setting all predecessors).

template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct HFORWARDFunctor
{
    typedef typename ProblemData::DataSlice DataSlice;
    static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
    {
        return true;
    }

    static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
    {
#

For the hub graph (original graph), set each edge's source node ID

        problem->d_hub_predecessors[e_id] = s_id;
    }

};
#

Backward advance functors for the Hub nodes (original graph). The backward advance functors distribute ranks to nodes.

template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct HBACKWARDFunctor
{
    typedef typename ProblemData::DataSlice DataSlice;
    static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
    {
#

Choose nodes with non-zero outgoing degrees ...

        VertexId v_id = problem->d_hub_predecessors[e_id_in];
        bool flag = (problem->d_out_degrees[v_id] != 0);
        if (!flag) problem->d_hrank_next[v_id] = 0;
        return flag;
    }

    static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
    {
#

then atomically update hub ranks.

        Value hrank_dst = problem->d_hrank_curr[d_id] / (problem->d_in_degrees[s_id] * problem->d_out_degrees[d_id]);
        VertexId v_id = problem->d_hub_predecessors[e_id_in];
        atomicAdd(&problem->d_hrank_next[v_id], hrank_dst);
    }
};
#

Forward advance functors for the Authority nodes (reverse graph) Like the Hub forward advance functor, this just sets all predecessors.

template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct AFORWARDFunctor
{
    typedef typename ProblemData::DataSlice DataSlice;

    static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
    {
        return true;
    }

    static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
    {
#

For authority graph (reverse graph), set each edge's source node ID

        problem->d_auth_predecessors[e_id] = s_id;
    }

};
#

Backward advance functors for the Authority nodes (reverse graph) The backward advance functors distribute ranks to nodes.

template<typename VertexId, typename SizeT, typename Value, typename ProblemData>
struct ABACKWARDFunctor
{
    typedef typename ProblemData::DataSlice DataSlice;

    static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
    {
#

Choose nodes with non-zero outgoing degrees ...

        VertexId v_id = problem->d_auth_predecessors[e_id_in];
        bool flag = (problem->d_in_degrees[v_id] != 0);
        if (!flag) problem->d_arank_next[v_id] = 0;
        return flag;
    }

    static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
    {
#

... then atomically update authority ranks.

        Value arank_dst = problem->d_arank_curr[d_id] / (problem->d_out_degrees[s_id] * problem->d_in_degrees[d_id]);
        VertexId v_id = problem->d_auth_predecessors[e_id_in];
        atomicAdd(&problem->d_arank_next[v_id], arank_dst);
    }
};

salsa_enactor.cuh

#

The enactor defines how a graph primitive runs. It calls traversal (advance and filter operators) and computation (functors).

#
class SalsaEnactor : public EnactorBase {
#

For SALSA, Constructor, Destructor, and Setup functions are ignored

#

This user-defined function swaps current and next rank pointers

    template <typename ProblemData>
        void SwapRank(ProblemData *problem, int is_hub, int nodes)
        {
            typedef typename ProblemData::Value Value;
            Value *rank_curr;
            Value *rank_next;
            if (is_hub) {
                rank_curr = problem->data_slices[0]->d_hrank_curr;
                rank_next = problem->data_slices[0]->d_hrank_next;
            } else {
                rank_curr = problem->data_slices[0]->d_hrank_curr;
                rank_next = problem->data_slices[0]->d_hrank_next;
            }
#

copy next to curr and reset next

            util::MemsetCopyVectorKernel<<<128, 128>>>(rank_curr, rank_next, nodes);
            util::MemsetKernel<<<128, 128>>>(rank_next, (Value)0.0, nodes);
        }
#

This enactor defines the SALSA high-level algorithm.

    template<
        typename AdvancePolicy,
        typename FilterPolicy,
        typename SALSAProblem>
                     cudaError_t EnactSALSA(
                             CudaContext &context,
                             SALSAProblem *problem,
                             int max_iteration) {

                         typedef typename SALSAProblem::VertexId VertexId;
                         typedef typename SALSAProblem::SizeT SizeT;
                         typedef typename SALSAProblem::Value Value;
#

Define SALSA functors.

                         typedef HFORWARDFunctor<
                             VertexId,
                             SizeT,
                             Value,
                             SALSAProblem> HForwardFunctor;

                         typedef AFORWARDFunctor<
                             VertexId,
                             SizeT,
                             Value,
                             SALSAProblem> AForwardFunctor;

                         typedef HBACKWARDFunctor<
                             VertexId,
                             SizeT,
                             Value,
                             SALSAProblem> HBackwardFunctor;

                         typedef ABACKWARDFunctor<
                             VertexId,
                             SizeT,
                             Value,
                             SALSAProblem> ABackwardFunctor;
#

Load the Setup function.

                         cudaError_t retval = cudaSuccess;
                         if (retval = EnactorBase::Setup(problem)) break;
#

Define the graph topology data pointer (g_slice) and the problem-specific data pointer (d_slice).

                         typename SALSAProblem::GraphSlice *g_slice = problem->d_graph_slices;
                         typename SALSAProblem::DataSlice *d_slice = problem->d_data_slices;
#

Now let's do some computation.

                         SizeT queue_length = g_slice->nodes;
                         int selector = 0;
                         {
#

First we'll do some initialization code that runs just once. Start by initializing the frontier with all node IDs.

                             util::MemsetIdxKernel<<<BLOCK, THREAD>>>(g_slice->ping_pong_working_queue[selector], g_slice->nodes);
#

Set predecessor nodes for each edge in the original graph.

                             gunrock::oprtr::advance::Kernel
                                 <AdvancePolicy, SALSAProblem, HForwardFunctor>
                                 <<<advance_grid_size, AdvancePolicy::THREADS>>>(
                                         queue_length,
                                         g_slice->ping_pong_working_queue[selector],
                                         g_slice->ping_pong_working_queue[selector^1],
                                         g_slice->d_row_offsets,
                                         g_slice->d_column_indices, //advance on original graph
                                         d_slice
                                         context,
                                         gunrock::oprtr::advance::V2E);
#

And set the predecessor nodes for each edge in the reverse graph.

                             gunrock::oprtr::advance::Kernel
                                 <AdvancePolicy, SALSAProblem, AForwardFunctor>
                                 <<<advance_grid_size, AdvancePolicy::THREADS>>>(
                                         queue_length,
                                         g_slice->ping_pong_working_queue[selector],
                                         g_slice->ping_pong_working_queue[selector^1],
                                         g_slice->d_column_offsets,
                                         g_slice->d_row_indices, //advance on reverse graph
                                         d_slice
                                         context,
                                         gunrock::oprtr::advance::V2E);
                         }
#

Now we iterate between two Advance operators, which update (1) the hub rank and (2) the authority rank. We loop until we've reached the maximum iteration count.

                         int iteration = 0;
                         while (true) {
                             util::MemsetIdxKernel<<<BLOCK, THREAD>>>(g_slice->ping_pong_working_queue[selector], g_slice->edges);
                             SizeT queue_length = g_slice->edges;
#

This Advance operator updates the hub rank ...

                             gunrock::oprtr::advance::Kernel
                                 <AdvancePolicy, SALSAProblem, ABackwardFunctor>
                                 <<<advance_grid_size, AdvancePolicy::THREADS>>>(
                                         queue_length,
                                         g_slice->ping_pong_working_queue[selector],
                                         g_slice->ping_pong_working_queue[selector^1],
                                         g_slice->d_column_offsets,
                                         g_slice->d_row_indices, //advance backward on reverse graph
                                         d_slice
                                         context,
                                         gunrock::oprtr::advance::E2V);

                             SwapRank<SALSAProblem>(problem, 0, g_slice->nodes);
#

and here, the authority rank.

                             gunrock::oprtr::advance::Kernel
                                 <AdvancePolicy, SALSAProblem, ABackwardFunctor>
                                 <<<advance_grid_size, AdvancePolicy::THREADS>>>(
                                         queue_length,
                                         g_slice->ping_pong_working_queue[selector],
                                         g_slice->ping_pong_working_queue[selector^1],
                                         g_slice->d_row_offsets,
                                         g_slice->d_column_indices, //advance backward on original graph
                                         d_slice
                                         context,
                                         gunrock::oprtr::advance::E2V);

                             SwapRank<SALSAProblem>(problem, 0, g_slice->nodes);

                             iteration++;
                             if (iteration >= max_iteration) break;
                         }

                         return retval;
                     }
#

The entry point in the driver code to SALSA is this Enact call.

    template <typename SALSAProblem>
        cudaError_t Enact(
                CudaContext &context,
                SALSAProblem *problem,
                typename SALSAProblem::SizeT max_iteration)
        {
#

Gunrock provides recommended settings here for kernel parameters, but they can be changed by end-users.

            typedef gunrock::oprtr::filter::KernelPolicy<
                SALSAProblem,
                300, //CUDA_ARCH
                8, //MIN_CTA_OCCUPANCY
                8> //LOG_THREAD_NUM
                FilterKernelPolicy;

            typedef gunrock::oprtr::advance::KernelPolicy<
                SALSAProblem,
                300,
                8, //MIN_CTA_OCCUPANCY
                10, //LOG_THREAD_NUM
                32*128> //THRESHOLD_TO_SWITCH_ADVANCE_MODE
                AdvanceKernelPolicy;

            return EnactSALSA<AdvanceKernelPolicy, FilterKernelPolicy, SALSAProblem>(
                    context, problem, max_iteration);
        }
};