Use Python & Pandas to Create a D3 Force Directed Network Diagram


Our Goal

Create an interactive force directed graph to illustrate network traffic.

You may need to edit the width and height depending on the size of your network

To get started, save the following code as index.html on your desktop or in another location you’ll remember.

index.html

<!-- Serve this folder with: python -m http.server 8000 (Python 3) or python -m SimpleHTTPServer 8000 (Python 2) //-->
<!-- http://bl.ocks.org/mbostock/4062045 //-->

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Pandas D3 Force Directed Example - www.austintaylor.io</title>

<!-- JavaScript Libraries //-->
<script src="http://d3js.org/d3.v3.min.js"></script>

<!-- CSS Style //-->
<link href="http://fonts.googleapis.com/css?family=Source+Sans+Pro:300,900|Source+Code+Pro:300" rel="stylesheet" type="text/css">
<style>
/* Page-wide typography: light weight by default, heavy weight for <b>. */
body {
    font-family: 'Source Sans Pro', sans-serif;
    font-weight: 300;
}

b {
    font-weight: 900;
}

/* Unfilled grey outline shape. */
.outline {
    fill: none;
    stroke: #888888;
    stroke-width: 1px;
}

/* SVG <text> label added next to a hovered node. */
#tooltip {
    font-size: 10pt;
    font-weight: 900;

    fill: #000000;
    stroke: #ffffff;
    stroke-width: 0.25px;
}

/* Node circles: thin white border (stroke-width is the valid property name). */
.node {
    stroke: #ffffff;
    stroke-width: 1px;
}

/* Link lines: semi-transparent grey; the script sets a per-link inline width. */
.link {
    fill: none;
    stroke: #888888;
    stroke-width: 1px;
    stroke-opacity: 0.5;
}

/* Emphasis style: thick, fully opaque red stroke. */
.highlight {
    stroke: red;
    stroke-width: 4px;
    stroke-opacity: 1.0;
}
</style>

<script>
// Blank space reserved around the plot; half of it (pad) is used as the
// offset on each side when the plot area is translated inside the SVG.
var margin = 20;
var pad = margin / 2;
// Overall SVG canvas size in pixels.
var width = 960;
var height = 500;
// D3's 20-colour categorical palette.
var color = d3.scale.category20();
// Adds a text label for a hovered node to the #plot group.
//   circle: D3 selection wrapping the SVG <circle>; its id attribute is used
//           as the label text and its cx/cy/r attributes position the label.
// The label is centred above the circle unless that would push it past the
// left or right edge of the plot, in which case it is anchored to that side.
function addTooltip(circle) {
    var cx = parseFloat(circle.attr("cx"));
    var cy = parseFloat(circle.attr("cy"));
    var radius = parseFloat(circle.attr("r"));
    var label = circle.attr("id");

    var tip = d3.select("#plot").append("text");
    tip.text(label);
    tip.attr("x", cx);
    tip.attr("y", cy);
    tip.attr("dy", -radius * 2);   // lift the label clear of the circle
    tip.attr("id", "tooltip");

    // Half of the rendered label width tells us how far it extends either side.
    var halfWidth = tip.node().getBBox().width / 2;

    var anchor = "middle";
    var shift = 0;
    if ((cx - halfWidth) < 0) {
        anchor = "start";
        shift = -radius;
    } else if ((cx + halfWidth) > (width - margin)) {
        anchor = "end";
        shift = radius;
    }

    tip.attr("text-anchor", anchor);
    tip.attr("dx", shift);
}
// Zoomable drawing layer: an SVG inside #chart whose inner <g> is transformed
// by redraw() on every zoom event. Sized with the shared width/height values
// (the names `w` and `h` are not defined anywhere in this script).
// NOTE(review): the visible markup only contains #force, not #chart, so this
// selection may be empty — confirm whether this zoom layer is still needed.
var vis = d3.select("#chart")
  .append("svg:svg")
    .attr("width", width)
    .attr("height", height)
    .attr("pointer-events", "all")
  .append('svg:g')
    .call(d3.behavior.zoom().on("zoom", redraw))
  .append('svg:g');
// White backdrop covering the whole drawing area.
vis.append('svg:rect')
    .attr('width', width)
    .attr('height', height)
    .attr('fill', 'white');
// Zoom handler: logs the current zoom event, then applies its translation and
// scale to the `vis` group as an SVG transform.
function redraw() {
  var shift = d3.event.translate;
  var zoom = d3.event.scale;
  console.log("here", shift, zoom);

  var transform = "translate(" + shift + ")" + " scale(" + zoom + ")";
  vis.attr("transform", transform);
}
// Builds the force-directed diagram inside the #force element.
//   graph: object with a `nodes` array and a `links` array, as loaded from the
//          JSON file passed to d3.json.
// Creates the SVG and the #plot group, starts the force layout, draws links
// and nodes, enables dragging, and keeps positions in sync on every tick.
function drawGraph(graph) {
    // If the JSON request fails the callback may be handed no data; report it
    // and stop instead of failing on graph.nodes further down.
    if (!graph || !graph.nodes || !graph.links) {
        console.error("drawGraph: graph data is missing nodes or links; nothing to draw.");
        return;
    }
    var svg = d3.select("#force").append("svg")
        .attr("width", width)
        .attr("height", height);
    // draw plot background
    svg.append("rect")
        .attr("width", width)
        .attr("height", height)
        .style("fill", "#eeeeee");
    // create an area within svg for plotting graph; drawLinks/drawNodes and
    // addTooltip find it again through its "plot" id
    svg.append("g")
        .attr("id", "plot")
        .attr("transform", "translate(" + pad + ", " + pad + ")");
    // https://github.com/mbostock/d3/wiki/Force-Layout#wiki-force
    var layout = d3.layout.force()
        .size([width - margin, height - margin])
        .charge(-120)
        .linkDistance(function(d, i) {
            // links inside one group are kept shorter than links across groups
            return (d.source.group == d.target.group) ? 50 : 100;
        })
        .nodes(graph.nodes)
        .links(graph.links)
        .start();
    drawLinks(graph.links);
    drawNodes(graph.nodes);
    // add ability to drag and update layout
    // https://github.com/mbostock/d3/wiki/Force-Layout#wiki-drag
    d3.selectAll(".node").call(layout.drag);
    // https://github.com/mbostock/d3/wiki/Force-Layout#wiki-on
    layout.on("tick", function() {
        // move every line and circle to the positions the layout just computed
        d3.selectAll(".link")
            .attr("x1", function(d) { return d.source.x; })
            .attr("y1", function(d) { return d.source.y; })
            .attr("x2", function(d) { return d.target.x; })
            .attr("y2", function(d) { return d.target.y; });
        d3.selectAll(".node")
            .attr("cx", function(d) { return d.x; })
            .attr("cy", function(d) { return d.y; });
    });
}
    // Per-tick step that spreads nodes for clustering: each node is nudged along
    // one of four diagonals chosen by the low two bits of its index, then the
    // `node` circles are moved to the updated positions.
    //   e: tick event; e.alpha scales the size of the nudge.
    // NOTE(review): relies on `graph` and `node` being in scope; neither is
    // defined in the visible script and nothing visible calls tick — confirm.
    function tick(e) {
        var step = 6 * e.alpha;
        graph.nodes.forEach(function(point, idx) {
            var dy = (idx & 1) ? step : -step;
            var dx = (idx & 2) ? step : -step;
            point.y += dy;
            point.x += dx;
        });
        node.attr("cx", function(d) { return d.x; });
        node.attr("cy", function(d) { return d.y; });
    }
// Renders one SVG circle per node inside #plot.
//   nodes: array of node objects; each circle takes its id from d.name, its
//          position from d.x / d.y and its fill colour from d.group.
// Hovering a circle shows a tooltip; leaving it removes the tooltip.
function drawNodes(nodes) {
    // one palette entry per group value
    var palette = d3.scale.category20();
    var radius = 4;

    // https://github.com/mbostock/d3/wiki/Force-Layout#wiki-nodes
    var circles = d3.select("#plot").selectAll(".node")
        .data(nodes)
        .enter()
        .append("circle");

    circles
        .attr("class", "node")
        .attr("id", function(d) { return d.name; })
        .attr("cx", function(d) { return d.x; })
        .attr("cy", function(d) { return d.y; })
        .attr("r", radius)
        .style("fill", function(d) { return palette(d.group); });

    circles
        .on("mouseover", function() { addTooltip(d3.select(this)); })
        .on("mouseout", function() { d3.select("#tooltip").remove(); });
}
// Renders one SVG line per link inside #plot.
//   links: array of link objects; d.source / d.target supply the end points
//          and d.value drives the line thickness.
// Thickness is scaled linearly from the smallest..largest value onto 1..6 px;
// links whose value is 1 or less are drawn dashed.
function drawLinks(links) {
    var valueRange = d3.extent(links, function(d) { return d.value; });
    var thickness = d3.scale.linear()
        .domain(valueRange)
        .range([1, 6]);

    // https://github.com/mbostock/d3/wiki/Force-Layout#wiki-links
    var lines = d3.select("#plot").selectAll(".link")
        .data(links)
        .enter()
        .append("line");

    lines
        .attr("class", "link")
        .attr("x1", function(d) { return d.source.x; })
        .attr("y1", function(d) { return d.source.y; })
        .attr("x2", function(d) { return d.target.x; })
        .attr("y2", function(d) { return d.target.y; });

    lines
        .style("stroke-width", function(d) {
            return thickness(d.value) + "px";
        })
        .style("stroke-dasharray", function(d) {
            if (d.value <= 1) {
                return "2, 2";
            }
            return "none";
        });
}
</script>
</head>

<body>
<div align="center" id="force"></div>

<script>
d3.json("pcap_export.json", drawGraph);
</script>
</body>
</html>

It’s easiest if the dataset and index.html are all in the same directory.

The dataset we’re going to use is from a SANS Holiday Challenge in 2013 which is available here

Getting started…


Use Python & Pandas to Create a D3 Force Directed Network Diagram

Required:

  • Python 3 or Python 2: Required Modules
    • Pandas: pip install pandas
    • IP Address Module: pre-installed with Python 3.x or Python 2.x
  • A text editor: Your choice
    • My Favorites: Sublime Text 3, iPython Notebook
  • Optional: You can get iPython Notebook and Pandas together by installing Anaconda 3

Step 1: Extract Data

In this example, we’re going to export the metadata from our PCAP using wireshark.

Set your filter
Type ip into the filter for IPv4 addresses

Set Filter
Mark the packets for export.
Edit > Mark All Displayed

Mark Packets for Export

Save/Export packets as CSV format.
File > Export Packet Dissections > Save as CSV Export Marked Packets

Name your file something you’ll remember. I named mine packet_metadata_ipv4.csv, which is the filename the code in Step 2 loads.

Step 2: Transform Data

Now we need to get the data into a dataframe. If you’ve never used Pandas before there is a great tutorial here.

Getting our data into a dataframe is simple with pandas’ read_csv function.

import pandas as pd
import json
import re

pcap_data = pd.read_csv('packet_metadata_ipv4.csv', index_col='No.')

Verify data loaded properly

dataframe = pcap_data


dataframe
Time Source Src Port Destination Dst Port Protocol Length
No.
1 0.000000 10.25.22.253 2546 10.25.22.250 80 TCP 62
2 0.000035 10.25.22.250 80 10.25.22.253 2546 TCP 62
3 0.000225 10.25.22.253 2546 10.25.22.250 80 TCP 60
4 0.000455 10.25.22.253 2546 10.25.22.250 80 HTTP 360
5 0.000482 10.25.22.250 80 10.25.22.253 2546 TCP 54
6 0.000957 10.25.22.250 80 10.25.22.253 2546 HTTP 1315
7 0.003018 10.25.22.253 2546 10.25.22.250 80 HTTP 340
8 0.003181 10.25.22.250 80 10.25.22.253 2546 TCP 1514
9 0.003298 10.25.22.250 80 10.25.22.253 2546 HTTP 1194
10 0.003531 10.25.22.253 2546 10.25.22.250 80 TCP 60
11 0.039293 10.25.22.253 2546 10.25.22.250 80 HTTP 344
12 0.039672 10.25.22.250 80 10.25.22.253 2546 HTTP 579
13 0.092320 10.25.22.253 2546 10.25.22.250 80 HTTP 267
14 0.092593 10.25.22.250 80 10.25.22.253 2546 HTTP 575
15 0.303521 10.25.22.253 2546 10.25.22.250 80 TCP 60
16 4.466444 10.25.22.253 2546 10.25.22.250 80 HTTP 442
17 4.466706 10.25.22.250 80 10.25.22.253 2546 TCP 14654
18 4.466912 10.25.22.253 2546 10.25.22.250 80 TCP 60
19 4.466927 10.25.22.250 80 10.25.22.253 2546 TCP 16114
20 4.467095 10.25.22.253 2546 10.25.22.250 80 TCP 60

We’ll want to structure our data in the same format as the infamous miserables.json

Here is a sample of miserables.json

json_data = {
  # Each node has a display name and a group number (the group picks the color).
  "nodes":[
    {"name":"Myriel","group":1},
    {"name":"Napoleon","group":1},
    {"name":"Mlle.Baptistine","group":1},
    {"name":"Mme.Magloire","group":1},
    {"name":"CountessdeLo","group":1},
    {"name":"Geborand","group":1},
  ],
  # Each link points at nodes by their position in the "nodes" list above, so
  # every source/target index used here must exist in that list.
  "links":[
    {"source":1,"target":0,"value":1},
    {"source":2,"target":0,"value":8},
    {"source":3,"target":0,"value":10},
    {"source":3,"target":2,"value":6},
    {"source":4,"target":0,"value":1},
    {"source":5,"target":0,"value":1},
  ]
}

-Nodes: This data is used to create an object and give the node a name. The group represents the color.
-Links: The source is used to identify the index position inside of the nodes list. For example “Napoleon” is in index position 1; same holds true for target. The value is the number of times the connection occurs.

Let’s get our PCAP data into the same format.

First, isolate source and destination.

src_dst = dataframe[["Source","Destination"]]

Load 10 sample pieces of data from the dataframe to validate data.

src_dst.sample(10)
Source Destination
No.
58224 10.16.92.103 10.16.92.79
37454 10.16.92.103 10.16.92.79
22425 10.16.92.79 10.16.92.103
72515 10.16.92.79 10.16.92.103
124518 10.16.92.103 10.16.92.79
93352 10.16.92.103 10.16.92.79
166810 10.16.92.79 10.16.92.103
73159 10.16.92.103 10.16.92.79
114681 10.16.92.79 10.16.92.103
156581 10.16.92.103 10.16.92.79


Filter out any hostnames that were included (may not apply to your dataset):

def ip_matcher(address):
    """Return True if ``address`` is a dotted-quad IPv4 string, else False.

    Each of the four octets must be in the range 0-255. Anything that is not
    a string (for example a NaN from an empty CSV cell) is reported as not
    being an IP address instead of raising.
    """
    try:
        # Raw string keeps the regex escapes intact; \Z (rather than $) rejects
        # a trailing newline after the last octet.
        ip = re.match(
            r'(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
            r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\Z', address)
    except TypeError:
        # re.match only accepts strings; non-string input is not an address.
        return False
    return ip is not None

# Lower-case the column names to match the D3 link keys, then flag which rows
# have a valid IPv4 address on each side. rename() and assign() return new
# frames, so nothing is written into a frame that was sliced out of `dataframe`.
src_dst = src_dst.rename(columns={"Source":"source","Destination":"target"})
src_dst = src_dst.assign(
    valid_src=src_dst["source"].apply(ip_matcher).astype(bool),
    valid_target=src_dst["target"].apply(ip_matcher).astype(bool),
)

# Keep only the rows where both endpoints are IP addresses (drops hostnames).
valid_src_dest = src_dst[src_dst["valid_src"] & src_dst["valid_target"]]

Group by source and target fields and count number of connections

Use inplace=True to rename the columns inplace without having to reassign to a new variable.

# Count the rows for every (source, target) pair. reset_index() turns the
# result back into a DataFrame; the count column is named 0 at this point.
grouped_src_dst = valid_src_dest.groupby(["source","target"]).size().reset_index()

Join source and target into consolidated index to be used for index position

# Every IP seen as a source or a target, de-duplicated in order of first
# appearance. An IP's position in this Index is later used as its node index.
# pd.concat is used because Series.append was removed in pandas 2.0.
unique_ips = pd.Index(
    pd.concat([grouped_src_dst['source'], grouped_src_dst['target']],
              ignore_index=True).unique())

Create subnet group
Note: We use a regular expression here to group addresses by their first three octets. For example, the two IP addresses 192.168.1.5 and 192.168.2.5 would be treated as belonging to two different networks. We’ll use these networks to create our groups and color the nodes.

# Map each network prefix (the first three octets of an address) to a group
# number, numbered from 1 in the order the prefixes first appear in unique_ips.
group_dict = {}
counter = 0
for ip in unique_ips:
    # Raw string so the regex escapes (\d, \.) are not read as string escapes.
    breakout_ip = re.match(r"^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$", ip)
    if breakout_ip:
        net_id = '.'.join(breakout_ip.group(1,2,3))
        if net_id not in group_dict:
            counter += 1
            group_dict[net_id] = counter

Next, we’ll begin structuring our data so that we can reference it later.

# Give the count column a readable name, then turn each row into a link record
# that still identifies its endpoints by IP address.
grouped_src_dst.rename(columns={0:'count'}, inplace=True)


def _row_to_link(row):
    """Build one link record from a (source, target, count) row."""
    return {"source": row['source'], "target": row['target'], "value": row['count']}


temp_links_list = list(grouped_src_dst.apply(_row_to_link, axis=1))

You should now have something like…

temp_links_list
[{'source': '0.0.0.0', 'target': '255.255.255.255', 'value': 157},
 {'source': '10.16.11.5', 'target': '10.25.22.253', 'value': 24},
 {'source': '10.16.92.103', 'target': '10.16.92.79', 'value': 105742},
 {'source': '10.16.92.79', 'target': '10.16.92.103', 'value': 36543},
 {'source': '10.2.2.2', 'target': '10.22.11.9', 'value': 3410},
 {'source': '10.2.2.2', 'target': '10.25.22.253', 'value': 57},
 {'source': '10.21.22.1', 'target': '10.21.22.22', 'value': 1},
 {'source': '10.21.22.1', 'target': '10.21.22.23', 'value': 1},
 {'source': '10.21.22.1', 'target': '10.21.22.24', 'value': 1},
 {'source': '10.21.22.1', 'target': '10.21.22.253', 'value': 19},
 {'source': '10.21.22.10', 'target': '10.21.22.22', 'value': 54},
 {'source': '10.21.22.10', 'target': '10.21.22.23', 'value': 96},
 {'source': '10.21.22.10', 'target': '10.21.22.24', 'value': 156},
 {'source': '10.21.22.10', 'target': '10.21.22.253', 'value': 14},
 {'source': '10.21.22.22', 'target': '10.21.22.1', 'value': 3},
 {'source': '10.21.22.22', 'target': '10.21.22.10', 'value': 40},
 {'source': '10.21.22.22', 'target': '10.21.22.23', 'value': 6},
 {'source': '10.21.22.22', 'target': '10.21.22.24', 'value': 6},
 {'source': '10.21.22.22', 'target': '10.21.22.253', 'value': 20}]
 

Now we need to extract the index location for each unique source and destination (target) pair and append it to our links list.

# Swap each link's IP addresses for their positions in unique_ips, so that
# source and target become index positions into the nodes list.
links_list = [
    {
        "value": link['value'],
        "source": unique_ips.get_loc(link['source']),
        "target": unique_ips.get_loc(link['target']),
    }
    for link in temp_links_list
]

Now that we have our links list, we’ll need to create our nodes.

# Build one node record per IP, in unique_ips order, tagged with the group
# number of its network prefix (first three octets).
nodes_list = []

for ip in unique_ips:
    # Raw string so the regex escapes (\d, \.) are not read as string escapes.
    breakout_ip = re.match(r"^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$", ip)
    # NOTE(review): an IP that failed this match would be skipped here, which
    # would shift later nodes relative to the positions used in links_list —
    # confirm every entry of unique_ips is a dotted-quad address.
    if breakout_ip:
        net_id = '.'.join(breakout_ip.group(1,2,3))
        nodes_list.append({"name":ip, "group": group_dict.get(net_id)})

Our nodes_list contains the IPs which we isolated earlier in unique_ips

validate data

nodes_list[0:8]

[
   {'group': 1, 'name': '0.0.0.0'},
   {'group': 2, 'name': '10.16.11.5'},
   {'group': 3, 'name': '10.16.92.103'},
   {'group': 3, 'name': '10.16.92.79'},
   {'group': 4, 'name': '10.2.2.2'},
   {'group': 5, 'name': '10.21.22.1'},
   {'group': 5, 'name': '10.21.22.10'},
   {'group': 5, 'name': '10.21.22.22'}
 ]

You should now see the index positions of the values instead of the values themselves represented in the links_list.

validate data

links_list

[
   {'source': 0, 'target': 58, 'value': 157},
   {'source': 1, 'target': 23, 'value': 24},
   {'source': 2, 'target': 3, 'value': 105742},
   {'source': 3, 'target': 2, 'value': 36543},
   {'source': 4, 'target': 11, 'value': 3410},
   {'source': 4, 'target': 23, 'value': 57},
   {'source': 5, 'target': 7, 'value': 1},
   {'source': 5, 'target': 8, 'value': 1},
   {'source': 5, 'target': 9, 'value': 1},
   {'source': 5, 'target': 10, 'value': 19},
   {'source': 6, 'target': 7, 'value': 54},
   {'source': 6, 'target': 8, 'value': 96},
   {'source': 6, 'target': 9, 'value': 156},
   {'source': 6, 'target': 10, 'value': 14},
   {'source': 7, 'target': 5, 'value': 3},
   {'source': 7, 'target': 6, 'value': 40},
   {'source': 7, 'target': 8, 'value': 6},
   {'source': 7, 'target': 9, 'value': 6},
   {'source': 7, 'target': 10, 'value': 20}
 ]
 

Time to prep our data to be loaded as a json and rendered in d3. This moves us into the next phase…

Step 3: Load Data

Create a variable called json_prep and assign our two lists as the values.

json_prep = {"nodes":nodes_list, "links":links_list}

json_prep.keys()
   dict_keys(['links', 'nodes'])

validate data (data sample)

json_dump = json.dumps(json_prep, indent=1, sort_keys=True)
print(json_dump)
{
 "links": [
  {
   "source": 0,
   "target": 58,
   "value": 157
  },
  {
   "source": 1,
   "target": 23,
   "value": 24
  }
 ],
 "nodes": [
  {
   "group": 1,
   "name": "0.0.0.0"
  },
  {
   "group": 2,
   "name": "10.16.11.5"
  }
 ]
}

Looks good!

Finally let’s write our data out to a file to be used in our D3 Force Directed Graph

# Write the JSON text to the file that index.html loads. The with-block closes
# the file even if the write fails.
filename_out = 'pcap_export.json'
with open(filename_out, 'w') as json_out:
    json_out.write(json_dump)

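Finally, start a Python web server from the directory that contains index.html and pcap_export.json, for example with python -m http.server 8000 (Python 3) or python -m SimpleHTTPServer 8000 (Python 2).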

Start Webserver

Open http://localhost:8000/index.html in your favorite web browser and view your network diagram!

Notebook

For convenience, I’ve included a copy of a jupyter notebook for you to follow along.

Bonus Examples

One caveat to the force directed diagram is its scalability. If you have a very large network you might run into browser performance issues. Here is an example of the largest diagram I have been able to render.

Interactive Force Directed Network Graph

Hope you have found this helpful. Please leave any questions in the comments below.