Source code for grape.parallel_general_graph

"""ParallelGeneralGraph for parallel directed graphs (DiGraph) module"""

import logging
import sys
import warnings
from multiprocessing import Queue
import multiprocessing as mp
from multiprocessing.sharedctypes import RawArray
import ctypes
import numpy as np
import networkx as nx

from .utils import chunk_it
from .general_graph import GeneralGraph

warnings.simplefilter(action='ignore', category=FutureWarning)
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


class ParallelGeneralGraph(GeneralGraph):
    """
    Class ParallelGeneralGraph for a parallel implementation of directed
    graphs (DiGraph).

    Constructs a new graph given an input file. A DiGraph stores nodes and
    edges with optional data or attributes. DiGraphs hold directed edges.
    Nodes can be arbitrary Python objects with optional key/value
    attributes. Edges are represented as links between nodes with optional
    key/value attributes.
    """

    def __init__(self):
        super().__init__()
        self.manager = mp.Manager()
        self.num = mp.cpu_count()

    def measure_iteration(self, nodes, record, kernel, *measure_args):
        """
        Inner iteration for parallel measures, to update the shared
        dictionary.

        :param list nodes: nodes for which to compute the shortest path
            between them and all the other nodes.
        :param multiprocessing.managers.dict record: shared dictionary
            to be updated.
        :param callable kernel: kernel measure to be computed.
        :param measure_args: arguments for the kernel measure. Have a look
            at the specific kernel measures in GeneralGraph for the
            particular variables/types required by each measure.
        """

        partial_dict = kernel(nodes, *measure_args)
        record.update(partial_dict)

    def measure_processes(self, record, kernel, *measure_args):
        """
        Division of the total number of nodes into chunks and parallel
        distribution of the tasks among processes, for the different
        kernel measure functions.

        :param multiprocessing.managers.dict record: shared dictionary
            to be updated.
        :param callable kernel: kernel measure to be computed.
        :param measure_args: arguments for the kernel measure (have a look
            at the specific kernel measures in GeneralGraph for the
            particular variables/types required by each measure).
        """

        node_chunks = chunk_it(list(self.nodes()), self.num)

        processes = [
            mp.Process(
                target=self.measure_iteration,
                args=(node_chunks[p], record, kernel, *measure_args))
            for p in range(self.num)
        ]

        for proc in processes:
            proc.start()

        for proc in processes:
            proc.join()

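    # A minimal sketch of the dispatch pattern above, assuming chunk_it
    # splits a list into ``num`` contiguous pieces (values illustrative):
    #
    #     nodes = ['a', 'b', 'c', 'd', 'e', 'f']
    #     chunk_it(nodes, 3)   # -> [['a', 'b'], ['c', 'd'], ['e', 'f']]
    #
    # Each chunk goes to one mp.Process running measure_iteration, and all
    # processes write their partial results into the same manager-backed
    # dictionary, so no explicit merge step is needed afterwards.
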
    def floyd_warshall_predecessor_and_distance(self):
        """
        Parallel Floyd-Warshall APSP algorithm. The predecessor and
        distance matrices are evaluated, together with the nested
        dictionaries for the shortest paths and their lengths.

        .. note:: Edge weights are taken into account in the distance
            matrix. Edge weight attributes must be numerical.
            Distances are calculated as sums of weighted edges traversed.

        :return: nested dictionary with key corresponding to source,
            while as value a dictionary keyed by target and valued by the
            source-target shortest path;
            nested dictionary with key corresponding to source, while as
            value a dictionary keyed by target and valued by the
            source-target shortest path length.
        :rtype: dict, dict
        """

        dist, pred = self.floyd_warshall_initialization()

        shared_d = RawArray(ctypes.c_double, dist.shape[0]**2)
        dist_shared = np.frombuffer(shared_d, 'float64').reshape(dist.shape)
        dist_shared[:] = dist

        shared_p = RawArray(ctypes.c_double, pred.shape[0]**2)
        pred_shared = np.frombuffer(shared_p, 'float64').reshape(pred.shape)
        pred_shared[:] = pred

        n = len(self.nodes())
        chunk = [(0, int(n / self.num))]
        node_chunks = chunk_it(list(self.nodes()), self.num)

        for i in range(1, self.num):
            chunk.append((chunk[i - 1][1],
                          chunk[i - 1][1] + len(node_chunks[i])))

        barrier = mp.Barrier(self.num)
        processes = [
            mp.Process(
                target=self.floyd_warshall_kernel,
                args=(dist_shared, pred_shared, chunk[p][0], chunk[p][1],
                      barrier))
            for p in range(self.num)
        ]

        for proc in processes:
            proc.start()

        for proc in processes:
            proc.join()

        all_shortest_path = self.manager.dict()

        processes = [
            mp.Process(
                target=self.measure_iteration,
                args=(list(map(self.ids_reversed.get, node_chunks[p])),
                      all_shortest_path, self.construct_path_kernel,
                      pred_shared))
            for p in range(self.num)
        ]

        for proc in processes:
            proc.start()

        for proc in processes:
            proc.join()

        nonempty_shortest_path = {}
        for k in all_shortest_path.keys():
            nonempty_shortest_path[k] = {
                key: value
                for key, value in all_shortest_path[k].items() if value
            }

        shortest_path_length = {}
        for i in list(self.H):
            shortest_path_length[self.ids[i]] = {}
            for key, value in nonempty_shortest_path[self.ids[i]].items():
                length_path = dist_shared[self.ids_reversed[value[0]],
                                          self.ids_reversed[value[-1]]]
                shortest_path_length[self.ids[i]][key] = length_path

        return nonempty_shortest_path, shortest_path_length

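    # The shared-memory pattern used above, shown in isolation (a sketch,
    # not part of the class): a RawArray is an unsynchronized ctypes
    # buffer that forked workers inherit, and NumPy can view it without
    # copying, so every process reads and writes the same matrix.
    #
    #     import ctypes
    #     import numpy as np
    #     from multiprocessing.sharedctypes import RawArray
    #
    #     buf = RawArray(ctypes.c_double, 4 * 4)
    #     mat = np.frombuffer(buf, dtype='float64').reshape(4, 4)
    #     mat[:] = np.inf   # visible to every process sharing ``buf``
    #
    # Because the buffer carries no lock, the k-iterations of the
    # Floyd-Warshall kernel are kept in step with the mp.Barrier passed
    # to each worker rather than with per-write synchronization.
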
    def dijkstra_iteration_parallel(self, out_queue, nodes):
        """
        Parallel SSSP algorithm based on Dijkstra's method.

        :param multiprocessing.queues.Queue out_queue: multiprocessing
            queue collecting the per-source results.
        :param list nodes: list of starting nodes from which the SSSP
            should be computed to every other target node in the graph.

        .. note:: Edge weights are taken into account.
            Edge weight attributes must be numerical.
            Distances are calculated as sums of weighted edges traversed.
        """

        for n in nodes:
            ssspp = (n, nx.single_source_dijkstra(self, n, weight='weight'))
            out_queue.put(ssspp)

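    # Shape of each queue item produced above, for reference:
    # nx.single_source_dijkstra returns a ``(lengths, paths)`` pair, so an
    # item is ``(source, (lengths_dict, paths_dict))``. A sketch for one
    # source (illustrative values):
    #
    #     ('a', ({'a': 0, 'b': 1.0}, {'a': ['a'], 'b': ['a', 'b']}))
    #
    # The wrapper below unpacks ``ssspp[1][0]`` as the lengths and
    # ``ssspp[1][1]`` as the paths.
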
    def dijkstra_single_source_shortest_path(self):
        """
        Wrapper for the parallel SSSP algorithm based on Dijkstra's
        method. The nested dictionaries for the shortest paths and their
        lengths are evaluated.

        .. note:: Edge weights are taken into account.
            Edge weight attributes must be numerical.
            Distances are calculated as sums of weighted edges traversed.

        :return: nested dictionary with key corresponding to source,
            while as value a dictionary keyed by target and valued by the
            source-target shortest path;
            nested dictionary with key corresponding to source, while as
            value a dictionary keyed by target and valued by the
            source-target shortest path length.
        :rtype: dict, dict
        """

        attribute_ssspp = []
        out_queue = Queue()
        node_chunks = chunk_it(list(self.nodes()), self.num)

        processes = [
            mp.Process(
                target=self.dijkstra_iteration_parallel,
                args=(out_queue, node_chunks[p]))
            for p in range(self.num)
        ]

        for proc in processes:
            proc.start()

        while True:
            running = any(p.is_alive() for p in processes)
            while not out_queue.empty():
                attribute_ssspp.append(out_queue.get())
            if not running:
                break

        shortest_path = {}
        shortest_path_length = {}

        for ssspp in attribute_ssspp:
            n = ssspp[0]
            shortest_path[n] = ssspp[1][1]
            shortest_path_length[n] = ssspp[1][0]

        return shortest_path, shortest_path_length

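    # Why the loop above drains the queue while workers are still alive:
    # a child process that has put items on a multiprocessing.Queue may
    # not terminate until the buffered items have been consumed, so
    # joining the workers before emptying the queue risks a deadlock.
    # Polling ``is_alive`` and ``get`` together avoids that, at the cost
    # of a busy-wait.
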
    def calculate_shortest_path(self):
        """
        Choose the most appropriate way to compute the all-pairs shortest
        paths, depending on graph size and density. For a dense graph,
        choose the Floyd-Warshall algorithm. For a sparse graph, choose
        the SSSP algorithm based on Dijkstra's method.

        .. note:: Edge weights of the graph are taken into account in the
            computation.

        :return: nested dictionary with key corresponding to source,
            while as value a dictionary keyed by target and valued by the
            source-target shortest path;
            nested dictionary with key corresponding to source, while as
            value a dictionary keyed by target and valued by the
            source-target shortest path length.
        :rtype: dict, dict
        """

        n_of_nodes = self.order()
        graph_density = nx.density(self)

        logging.debug('Number of processors: %s', self.num)
        logging.debug('The graph has %s nodes', n_of_nodes)

        if graph_density <= 0.000001:
            logging.debug('The graph is sparse, density = %s',
                          graph_density)
            shpath, shpath_len = self.dijkstra_single_source_shortest_path()
        else:
            logging.debug('The graph is dense, density = %s',
                          graph_density)
            shpath, shpath_len = \
                self.floyd_warshall_predecessor_and_distance()

        return shpath, shpath_len

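    # Worked example of the density test above: for a DiGraph,
    # nx.density(G) = m / (n * (n - 1)) with n nodes and m edges, so a
    # graph with n = 1000 and m = 5000 has density
    # 5000 / (1000 * 999) ≈ 5.0e-03 > 1e-06 and is routed to
    # Floyd-Warshall, while the SSSP branch is reserved for graphs that
    # are almost edge-free at this scale.
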
    def compute_efficiency(self):
        """
        Efficiency calculation.

        .. note:: The efficiency of a path connecting two nodes is defined
            as the inverse of the path length, if the path has non-zero
            length, and zero otherwise.

        :return: efficiency computed for every node. The keys correspond
            to source, while as value a dictionary keyed by target and
            valued by the source-target efficiency.
        :rtype: multiprocessing.managers.dict
        """

        shortest_path_length = self.shortest_path_length
        efficiency = self.manager.dict()

        self.measure_processes(efficiency, self.efficiency_kernel,
                               shortest_path_length)

        return efficiency

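    # Worked example of the definition in the note above: a pair of nodes
    # joined by a shortest path of weighted length 2.0 has efficiency
    # 1 / 2.0 = 0.5; an unreachable pair contributes 0.
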
    def compute_nodal_efficiency(self):
        """
        Nodal efficiency calculation.

        .. note:: The nodal efficiency of a node is equal to zero for a
            node without any outgoing path, and equal to one if from it
            we can reach every node of the digraph.

        :return: nodal efficiency computed for every node.
        :rtype: multiprocessing.managers.dict
        """

        graph_size = len(list(self))
        efficiency = self.efficiency
        nodal_efficiency = self.manager.dict()

        self.measure_processes(nodal_efficiency,
                               self.nodal_efficiency_kernel, efficiency,
                               graph_size)

        return nodal_efficiency

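    # Worked example, assuming nodal efficiency averages the pair
    # efficiencies from a node to all the other nodes (the exact formula
    # lives in nodal_efficiency_kernel in GeneralGraph): in a digraph
    # with graph_size = 4, a node with outgoing pair efficiencies
    # {0.5, 1.0, 0.0} gets (0.5 + 1.0 + 0.0) / (4 - 1) = 0.5.
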
    def compute_local_efficiency(self):
        """
        Local efficiency calculation.

        .. note:: The local efficiency shows the efficiency of the
            connections between the first-order outgoing neighbors of
            node v when v is removed. Equivalently, local efficiency
            measures the resilience of the digraph to the perturbation of
            node removal, i.e. if we remove a node, how efficiently its
            first-order outgoing neighbors can communicate.
            It is in the range [0, 1].

        :return: local efficiency computed for every node.
        :rtype: multiprocessing.managers.dict
        """

        nodal_efficiency = self.nodal_efficiency
        local_efficiency = self.manager.dict()

        self.measure_processes(local_efficiency,
                               self.local_efficiency_kernel,
                               nodal_efficiency)

        return local_efficiency

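    # Worked example, assuming local efficiency averages the nodal
    # efficiencies of the first-order outgoing neighbors (consistent with
    # the note above; the exact formula lives in local_efficiency_kernel):
    # a node v with successors of nodal efficiency 0.8 and 0.4 gets
    # (0.8 + 0.4) / 2 = 0.6, while a node with no successors gets 0.
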
    def shortest_path_list_iteration(self, nodes, shortest_path,
                                     tot_shortest_paths_list):
        """
        Inner iteration for the parallel shortest-path list calculation,
        to update the shared list.

        :param list nodes: list of nodes for which to compute the
            shortest path between them and all the other nodes.
        :param dict shortest_path: nested dictionary with key
            corresponding to source, while as value a dictionary keyed by
            target and valued by the source-target shortest path.
        :param tot_shortest_paths_list: list of shortest paths with at
            least two nodes.
        :type tot_shortest_paths_list: multiprocessing.managers.list
        """

        partial_shortest_paths_list = self.shortest_path_list_kernel(
            nodes, shortest_path)
        tot_shortest_paths_list.extend(partial_shortest_paths_list)

    def compute_betweenness_centrality(self):
        """
        Betweenness centrality calculation.

        .. note:: Betweenness centrality is an index of the relative
            importance of a node, defined by the number of shortest paths
            that run through it. Nodes with the highest betweenness
            centrality hold the highest level of control over the
            information flowing between different nodes in the network,
            because more information passes through them.

        :return: betweenness centrality computed for every node.
        :rtype: multiprocessing.managers.dict
        """

        shortest_path = self.shortest_path
        tot_shortest_paths_list = self.manager.list()
        node_chunks = chunk_it(list(self.nodes()), self.num)

        processes = [
            mp.Process(
                target=self.shortest_path_list_iteration,
                args=(node_chunks[p], shortest_path,
                      tot_shortest_paths_list))
            for p in range(self.num)
        ]

        for proc in processes:
            proc.start()

        for proc in processes:
            proc.join()

        betweenness_centrality = self.manager.dict()

        self.measure_processes(betweenness_centrality,
                               self.betweenness_centrality_kernel,
                               tot_shortest_paths_list)

        return betweenness_centrality

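    # Worked example of the counting in the note above (illustrative): in
    # the path digraph a -> b -> c, the shortest paths with at least two
    # nodes are [a, b], [b, c] and [a, b, c]; only b appears in the
    # interior of a path, so b receives the highest betweenness while a
    # and c never appear as intermediates.
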
    def compute_closeness_centrality(self):
        """
        Closeness centrality calculation.

        .. note:: Closeness centrality measures the reciprocal of the
            average shortest-path distance from a node to all other
            reachable nodes in the graph. Thus, the more central a node
            is, the closer it is to all other nodes. This measure allows
            one to identify good broadcasters, that is, key elements in a
            graph, depicting how closely the nodes are connected to each
            other.

        :return: closeness centrality computed for every node.
        :rtype: multiprocessing.managers.dict
        """

        graph_size = len(list(self))
        shortest_path = self.shortest_path
        shortest_path_length = self.shortest_path_length
        tot_shortest_paths_list = self.manager.list()
        node_chunks = chunk_it(list(self.nodes()), self.num)

        processes = [
            mp.Process(
                target=self.shortest_path_list_iteration,
                args=(node_chunks[p], shortest_path,
                      tot_shortest_paths_list))
            for p in range(self.num)
        ]

        for proc in processes:
            proc.start()

        for proc in processes:
            proc.join()

        closeness_centrality = self.manager.dict()

        self.measure_processes(closeness_centrality,
                               self.closeness_centrality_kernel,
                               shortest_path_length,
                               tot_shortest_paths_list, graph_size)

        return closeness_centrality

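    # Worked example of the reciprocal-distance idea in the note above
    # (the exact normalization lives in closeness_centrality_kernel): a
    # node reached at distances {1, 2} from the two nodes that can reach
    # it has average distance 1.5, hence a closeness proportional to
    # 1 / 1.5 ≈ 0.67.
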
    def compute_degree_centrality(self):
        """
        Degree centrality calculation.

        .. note:: Degree centrality is a simple centrality measure that
            counts how many neighbors a node has in an undirected graph.
            The more neighbors a node has, the more important it is,
            occupying a strategic position that serves as a source or
            conduit for large volumes of flux transactions with other
            nodes. A node with high degree centrality is a node with many
            dependencies.

        :return: degree centrality computed for every node.
        :rtype: multiprocessing.managers.dict
        """

        graph_size = len(list(self))
        degree_centrality = self.manager.dict()

        self.measure_processes(degree_centrality,
                               self.degree_centrality_kernel, graph_size)

        return degree_centrality

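    # Worked example, assuming the common normalization degree / (n - 1)
    # (the exact formula lives in degree_centrality_kernel): in a digraph
    # with n = 5 nodes, a node touching 2 edges has degree centrality
    # 2 / (5 - 1) = 0.5.
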
    def compute_indegree_centrality(self):
        """
        In-degree centrality calculation.

        .. note:: In-degree centrality is measured by the number of edges
            ending at the node in a directed graph. Nodes with high
            in-degree centrality are called cascade resulting nodes.

        :return: in-degree centrality computed for every node.
        :rtype: multiprocessing.managers.dict
        """

        graph_size = len(list(self))
        indegree_centrality = self.manager.dict()

        self.measure_processes(indegree_centrality,
                               self.indegree_centrality_kernel, graph_size)

        return indegree_centrality

    def compute_outdegree_centrality(self):
        """
        Out-degree centrality calculation.

        .. note:: Out-degree centrality is measured by the number of edges
            starting from a node in a directed graph. Nodes with high
            out-degree centrality are called cascade initiating nodes.

        :return: out-degree centrality computed for every node.
        :rtype: multiprocessing.managers.dict
        """

        graph_size = len(list(self))
        outdegree_centrality = self.manager.dict()

        self.measure_processes(outdegree_centrality,
                               self.outdegree_centrality_kernel,
                               graph_size)

        return outdegree_centrality
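
# A minimal usage sketch. The class is built from an input file (see the
# class docstring); the ``load`` call below is hypothetical and stands in
# for whatever loader GeneralGraph actually exposes.
#
# if __name__ == '__main__':
#     g = ParallelGeneralGraph()
#     g.load('input_graph.csv')   # hypothetical loader name
#     paths, lengths = g.calculate_shortest_path()
#     logging.debug('Computed shortest paths for %s nodes', len(paths))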