~ / teaching / InfoVis / practical works / Anscombe's quartet

Anscombe's quartet

The goal of this practical work is to build a first visualisation using D3.js.

You will recreate the famous visualisation of Anscombe's quartet, which looks like this in the original publication:

Setup

Get the archive that contains the dataset, the version of D3.js that you are going to use, and a visualisation template. Unzip it and familiarise yourself with its content.

The data directory contains the datasets created by Anscombe in a machine-readable format. The (tab-separated) columns give the number of the dataset (dataset), the number of the observation (observation), and the two attributes (x and y).

The vendor directory contains the D3.js version 7.8.5 code.

The viz directory contains a visualisation template that produces an HTML table per dataset.

To test this template, you need to start a web server. The simplest way to do so is to use Python in a terminal, as shown below.

% tar xzvf anscombe.tgz
tar xzvf anscombe.tgz 
x anscombe/data/
x anscombe/data/anscombe.tsv
x anscombe/vendor/
[...]
x anscombe/viz/
x anscombe/viz/anscombe.html
% cd anscombe/
% python3 -m http.server 8000
Serving HTTP on 0.0.0.0 port 8000 ...

You can then open your local version of the visualisation in a new tab; it should display the data sets as tables, as the online version does.

Code walkthrough

First, we define some styles, load D3, and we are ready for some javascript…

<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <style type="text/css" media="screen, print">
            body  { margin: 30px 50px; font-family: sans-serif; }
            div   { float: left; margin: 10px; }
            table { border: 1px solid black; width: 12em;}
            td    { text-align: right; padding: 0 0 0 10px; }
        </style>
        <title>Anscombe's quartet</title>
    </head>
    <body>
        <h1>Anscombe's quartet</h1>

        <script src="../vendor/d3-7.8.5/dist/d3.js"></script>
        <script>

We create 4 HTML divs, one for each dataset:

// One container <div> per dataset, keyed by dataset number and appended to <body> in that order.
var body = d3.select("body");
var divs = {};
for (const id of ['1', '2', '3', '4']) {
    divs[id] = body.append('div');
}

We define some helpers to format numbers nicely:

// Fixed-point ("f") number formatter whose precision is derived from a step of 0.01.
var s = Object.assign(d3.formatSpecifier("f"), {
    precision: d3.precisionFixed(0.01),
});
var f = d3.format(s);

We are now ready to load the data, converting it on the fly to have numbers instead of strings for the x and y attributes, and processing the resulting data:

d3.tsv("../data/anscombe.tsv", d => (
    {
        dataset:     d.dataset,
        observation: d.observation,
        x: +d.x,
        y: +d.y,
    }
)).then(data => {

For each of our divs…

    for(k in divs) {
        var div = divs[k];

…we create an HTML table and set its caption (note the use of a template string to insert the value of k),…

        var table = div.append('table');
        table.append('caption')
            .text(`data set ${k}`);

…we add a header row to the table…

        var header = table.append('tr');
        header.append('th')
            .text('observation');
        header.append('th')
            .text('x');
        header.append('th')
            .text('y');

…we filter the data to focus on the current data set, and we use the D3 selection mechanism to add to the table one row per line of the data set…

        var dataset = data.filter(d => d.dataset == k);
        var row = table.selectAll('.row')
            .data(dataset)
          .join('tr')
            .attr('class', 'row');

…and for each row, we add 3 cells containing the observation number, and the x and y attributes.

        row.append('td')
            .text(d => d.observation);
        row.append('td')
            .text(d => d.x);
        row.append('td')
            .text(d => f(d.y));

We are now done with our loop on datasets, with our data loading, with our script and with our web page!

    };
});
        </script>
    </body>
</html>

Scatterplot creation

The visualisation will use the following visual mapping: the x and y attributes will be mapped onto the horizontal and vertical position. The final visualisation may look like this:

To do so, you will have to transform your code.

// One container <div> per dataset, keyed by dataset number and appended to <body> in that order.
var body = d3.select("body");
var divs = {};
['1', '2', '3', '4'].forEach(id => {
    divs[id] = body.append('div');
});

You will need to compute a linear regression for the data. The following function is a possible implementation; it returns a linear fit of the data for the two attributes passed as parameters:

/**
 * Ordinary least-squares fit of a straight line through the points
 * (d[x_attr], d[y_attr]) of `data`.
 *
 * @param {Object[]} data - the observations.
 * @param {string} x_attr - name of the property used as the x value.
 * @param {string} y_attr - name of the property used as the y value.
 * @returns {function(number): number} a function mapping an x value to the
 *   fitted y value. When the slope is undetermined (a single observation, or
 *   all x values equal) a horizontal line through the mean of y is returned
 *   instead of a line of NaN. For empty data the function still returns NaN.
 */
function linear_fit(data, x_attr, y_attr) {
    const n = data.length;
    const Sx  = d3.sum(data, d => d[x_attr]);
    const Sxx = d3.sum(data, d => d[x_attr] * d[x_attr]);
    const Sy  = d3.sum(data, d => d[y_attr]);
    const Sxy = d3.sum(data, d => d[x_attr] * d[y_attr]);
    // The denominator is n² times the variance of x; it is zero when x does not vary.
    const denominator = n*Sxx - Sx*Sx;
    const slope = denominator === 0 ? 0 : (n*Sxy - Sx*Sy)/denominator;
    const intercept = (Sy - slope*Sx)/n;
    return (x) => intercept + x*slope;
}

First, we define margins using Bostock's margin convention, and two linear scales for the x and y attributes, which will allow us to convert from the data space to the screen space and back (see the tutorial):

// Margin convention: width and height describe the inner plotting area,
// i.e. the outer 240x150 box minus the margins.
var margin = {top: 10, right: 10, bottom: 20, left: 20};
var width  = 240 - (margin.left + margin.right);
var height = 150 - (margin.top + margin.bottom);

// Scales from data space to screen space. The y range runs from height down to 0.
var x = d3.scaleLinear().range([0, width]);
var y = d3.scaleLinear().range([height, 0]);

We are now ready to process our data as usual:

// Fixed-point ("f") number formatter whose precision is derived from a step of 0.01.
// NOTE(review): `f` is not referenced in the scatterplot code shown here.
var s = Object.assign(d3.formatSpecifier("f"), {
    precision: d3.precisionFixed(0.01),
});
var f = d3.format(s);

d3.tsv("../data/anscombe.tsv", d => ({
    dataset:     d.dataset,
    observation: d.observation,
    x: +d.x,
    y: +d.y,
})).then(data => {

Now that we have the data, we can set the domains of our scales:

    x.domain([0, 20]).nice();
    y.domain([0, 13]).nice();

For each one of our datasets, we create an svg element with the appropriate margins and coordinate space:

    for(k in divs) {
        var div = divs[k];
        div.append('p')
            .text(`data set ${k}`);

        var svg = div.append('svg')
            .attr("width", width + margin.left + margin.right)
            .attr("height", height + margin.top + margin.bottom)
          .append("g")
            .attr("transform", `translate(${margin.left},${margin.top})`);

For each observation, we create an SVG circle of radius 2 and translate it to the appropriate position computed from the x and y attributes using the scale objects:

        var dataset = data.filter(d => d.dataset == k);
        svg.selectAll('.dot')
            .data(dataset)
          .join('circle')
            .attr('class', 'dot')
            .attr('transform', d => `translate(${x(d.x)},${y(d.y)})`)
            .attr('r', 2);

We can now compute the regression and draw a line accordingly:

        // Evaluate the fitted line at the two ends of the x domain...
        var fit = linear_fit(dataset, 'x', 'y');
        const endpoints = x.domain().map(xValue => [xValue, fit(xValue)]);

        // ...and draw it as a path, converting each [x, y] pair to screen space.
        var line = d3.line()
            .x(point => x(point[0]))
            .y(point => y(point[1]));
        svg.append('path')
            .datum(endpoints)
            .attr('d', line)
            .attr('stroke-linecap', 'round')
            .attr('stroke', 'black');

And now, some D3 magic to display the axes on the graph:

        var xAxis = d3.axisBottom(x);
        svg.append('g')
            .attr('transform', `translate(0,${y(0)})`)
            .call(xAxis.ticks(20/5).tickSize(-10));
        svg.append('g')
            .attr('transform', `translate(0,${y(0)})`)
            .call(xAxis.ticks(20/1).tickSize(-5).tickFormat(''));

        var yAxis = d3.axisLeft(y);
        svg.append('g')
            .attr('transform', `translate(${x(0)})`)
            .call(yAxis.ticks(13/5).tickSize(-10));
        svg.append('g')
            .attr('transform', `translate(${x(0)})`)
            .call(yAxis.ticks(13/1).tickSize(-5).tickFormat(''));
    };
});

update: 7 oct. 2024