diff --git a/docs/data_transformation.png b/docs/data_transformation.png
deleted file mode 100644
index f204f118e4b35937233941606914c78c6073a12a..0000000000000000000000000000000000000000
Binary files a/docs/data_transformation.png and /dev/null differ
diff --git a/docs/datasets_privacy.html b/docs/datasets_privacy.html
deleted file mode 100644
index 999835c41359e10c31190d2260ace0497896a511..0000000000000000000000000000000000000000
--- a/docs/datasets_privacy.html
+++ /dev/null
@@ -1,421 +0,0 @@
-<!DOCTYPE html><html><head>
-      <title>datasets_privacy</title>
-      <meta charset="utf-8">
-      <meta name="viewport" content="width=device-width, initial-scale=1.0">
-      
-      <link rel="stylesheet" href="file:////home/boris/.atom/packages/markdown-preview-enhanced/node_modules/@shd101wyy/mume/dependencies/katex/katex.min.css">
-      
-      
-      
-      
-      
-      
-      
-      
-      
-
-      <style> 
-      /**
- * prism.js Github theme based on GitHub's theme.
- * @author Sam Clarke
- */
-code[class*="language-"],
-pre[class*="language-"] {
-  color: #333;
-  background: none;
-  font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
-  text-align: left;
-  white-space: pre;
-  word-spacing: normal;
-  word-break: normal;
-  word-wrap: normal;
-  line-height: 1.4;
-
-  -moz-tab-size: 8;
-  -o-tab-size: 8;
-  tab-size: 8;
-
-  -webkit-hyphens: none;
-  -moz-hyphens: none;
-  -ms-hyphens: none;
-  hyphens: none;
-}
-
-/* Code blocks */
-pre[class*="language-"] {
-  padding: .8em;
-  overflow: auto;
-  /* border: 1px solid #ddd; */
-  border-radius: 3px;
-  /* background: #fff; */
-  background: #f5f5f5;
-}
-
-/* Inline code */
-:not(pre) > code[class*="language-"] {
-  padding: .1em;
-  border-radius: .3em;
-  white-space: normal;
-  background: #f5f5f5;
-}
-
-.token.comment,
-.token.blockquote {
-  color: #969896;
-}
-
-.token.cdata {
-  color: #183691;
-}
-
-.token.doctype,
-.token.punctuation,
-.token.variable,
-.token.macro.property {
-  color: #333;
-}
-
-.token.operator,
-.token.important,
-.token.keyword,
-.token.rule,
-.token.builtin {
-  color: #a71d5d;
-}
-
-.token.string,
-.token.url,
-.token.regex,
-.token.attr-value {
-  color: #183691;
-}
-
-.token.property,
-.token.number,
-.token.boolean,
-.token.entity,
-.token.atrule,
-.token.constant,
-.token.symbol,
-.token.command,
-.token.code {
-  color: #0086b3;
-}
-
-.token.tag,
-.token.selector,
-.token.prolog {
-  color: #63a35c;
-}
-
-.token.function,
-.token.namespace,
-.token.pseudo-element,
-.token.class,
-.token.class-name,
-.token.pseudo-class,
-.token.id,
-.token.url-reference .token.variable,
-.token.attr-name {
-  color: #795da3;
-}
-
-.token.entity {
-  cursor: help;
-}
-
-.token.title,
-.token.title .token.punctuation {
-  font-weight: bold;
-  color: #1d3e81;
-}
-
-.token.list {
-  color: #ed6a43;
-}
-
-.token.inserted {
-  background-color: #eaffea;
-  color: #55a532;
-}
-
-.token.deleted {
-  background-color: #ffecec;
-  color: #bd2c00;
-}
-
-.token.bold {
-  font-weight: bold;
-}
-
-.token.italic {
-  font-style: italic;
-}
-
-
-/* JSON */
-.language-json .token.property {
-  color: #183691;
-}
-
-.language-markup .token.tag .token.punctuation {
-  color: #333;
-}
-
-/* CSS */
-code.language-css,
-.language-css .token.function {
-  color: #0086b3;
-}
-
-/* YAML */
-.language-yaml .token.atrule {
-  color: #63a35c;
-}
-
-code.language-yaml {
-  color: #183691;
-}
-
-/* Ruby */
-.language-ruby .token.function {
-  color: #333;
-}
-
-/* Markdown */
-.language-markdown .token.url {
-  color: #795da3;
-}
-
-/* Makefile */
-.language-makefile .token.symbol {
-  color: #795da3;
-}
-
-.language-makefile .token.variable {
-  color: #183691;
-}
-
-.language-makefile .token.builtin {
-  color: #0086b3;
-}
-
-/* Bash */
-.language-bash .token.keyword {
-  color: #0086b3;
-}
-
-/* highlight */
-pre[data-line] {
-  position: relative;
-  padding: 1em 0 1em 3em;
-}
-pre[data-line] .line-highlight-wrapper {
-  position: absolute;
-  top: 0;
-  left: 0;
-  background-color: transparent;
-  display: block;
-  width: 100%;
-}
-
-pre[data-line] .line-highlight {
-  position: absolute;
-  left: 0;
-  right: 0;
-  padding: inherit 0;
-  margin-top: 1em;
-  background: hsla(24, 20%, 50%,.08);
-  background: linear-gradient(to right, hsla(24, 20%, 50%,.1) 70%, hsla(24, 20%, 50%,0));
-  pointer-events: none;
-  line-height: inherit;
-  white-space: pre;
-}
-
-pre[data-line] .line-highlight:before, 
-pre[data-line] .line-highlight[data-end]:after {
-  content: attr(data-start);
-  position: absolute;
-  top: .4em;
-  left: .6em;
-  min-width: 1em;
-  padding: 0 .5em;
-  background-color: hsla(24, 20%, 50%,.4);
-  color: hsl(24, 20%, 95%);
-  font: bold 65%/1.5 sans-serif;
-  text-align: center;
-  vertical-align: .3em;
-  border-radius: 999px;
-  text-shadow: none;
-  box-shadow: 0 1px white;
-}
-
-pre[data-line] .line-highlight[data-end]:after {
-  content: attr(data-end);
-  top: auto;
-  bottom: .4em;
-}html body{font-family:"Helvetica Neue",Helvetica,"Segoe UI",Arial,freesans,sans-serif;font-size:16px;line-height:1.6;color:#333;background-color:#fff;overflow:initial;box-sizing:border-box;word-wrap:break-word}html body>:first-child{margin-top:0}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{line-height:1.2;margin-top:1em;margin-bottom:16px;color:#000}html body h1{font-size:2.25em;font-weight:300;padding-bottom:.3em}html body h2{font-size:1.75em;font-weight:400;padding-bottom:.3em}html body h3{font-size:1.5em;font-weight:500}html body h4{font-size:1.25em;font-weight:600}html body h5{font-size:1.1em;font-weight:600}html body h6{font-size:1em;font-weight:600}html body h1,html body h2,html body h3,html body h4,html body h5{font-weight:600}html body h5{font-size:1em}html body h6{color:#5c5c5c}html body strong{color:#000}html body del{color:#5c5c5c}html body a:not([href]){color:inherit;text-decoration:none}html body a{color:#08c;text-decoration:none}html body a:hover{color:#00a3f5;text-decoration:none}html body img{max-width:100%}html body>p{margin-top:0;margin-bottom:16px;word-wrap:break-word}html body>ul,html body>ol{margin-bottom:16px}html body ul,html body ol{padding-left:2em}html body ul.no-list,html body ol.no-list{padding:0;list-style-type:none}html body ul ul,html body ul ol,html body ol ol,html body ol ul{margin-top:0;margin-bottom:0}html body li{margin-bottom:0}html body li.task-list-item{list-style:none}html body li>p{margin-top:0;margin-bottom:0}html body .task-list-item-checkbox{margin:0 .2em .25em -1.8em;vertical-align:middle}html body .task-list-item-checkbox:hover{cursor:pointer}html body blockquote{margin:16px 0;font-size:inherit;padding:0 15px;color:#5c5c5c;border-left:4px solid #d6d6d6}html body blockquote>:first-child{margin-top:0}html body blockquote>:last-child{margin-bottom:0}html body hr{height:4px;margin:32px 0;background-color:#d6d6d6;border:0 none}html body table{margin:10px 0 15px 
0;border-collapse:collapse;border-spacing:0;display:block;width:100%;overflow:auto;word-break:normal;word-break:keep-all}html body table th{font-weight:bold;color:#000}html body table td,html body table th{border:1px solid #d6d6d6;padding:6px 13px}html body dl{padding:0}html body dl dt{padding:0;margin-top:16px;font-size:1em;font-style:italic;font-weight:bold}html body dl dd{padding:0 16px;margin-bottom:16px}html body code{font-family:Menlo,Monaco,Consolas,'Courier New',monospace;font-size:.85em !important;color:#000;background-color:#f0f0f0;border-radius:3px;padding:.2em 0}html body code::before,html body code::after{letter-spacing:-0.2em;content:"\00a0"}html body pre>code{padding:0;margin:0;font-size:.85em !important;word-break:normal;white-space:pre;background:transparent;border:0}html body .highlight{margin-bottom:16px}html body .highlight pre,html body pre{padding:1em;overflow:auto;font-size:.85em !important;line-height:1.45;border:#d6d6d6;border-radius:3px}html body .highlight pre{margin-bottom:0;word-break:normal}html body pre code,html body pre tt{display:inline;max-width:initial;padding:0;margin:0;overflow:initial;line-height:inherit;word-wrap:normal;background-color:transparent;border:0}html body pre code:before,html body pre tt:before,html body pre code:after,html body pre tt:after{content:normal}html body p,html body blockquote,html body ul,html body ol,html body dl,html body pre{margin-top:0;margin-bottom:16px}html body kbd{color:#000;border:1px solid #d6d6d6;border-bottom:2px solid #c7c7c7;padding:2px 4px;background-color:#f0f0f0;border-radius:3px}@media print{html body{background-color:#fff}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{color:#000;page-break-after:avoid}html body blockquote{color:#5c5c5c}html body pre{page-break-inside:avoid}html body table{display:table}html body img{display:block;max-width:100%;max-height:100%}html body pre,html body 
code{word-wrap:break-word;white-space:pre}}.markdown-preview{width:100%;height:100%;box-sizing:border-box}.markdown-preview .pagebreak,.markdown-preview .newpage{page-break-before:always}.markdown-preview pre.line-numbers{position:relative;padding-left:3.8em;counter-reset:linenumber}.markdown-preview pre.line-numbers>code{position:relative}.markdown-preview pre.line-numbers .line-numbers-rows{position:absolute;pointer-events:none;top:1em;font-size:100%;left:0;width:3em;letter-spacing:-1px;border-right:1px solid #999;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.markdown-preview pre.line-numbers .line-numbers-rows>span{pointer-events:none;display:block;counter-increment:linenumber}.markdown-preview pre.line-numbers .line-numbers-rows>span:before{content:counter(linenumber);color:#999;display:block;padding-right:.8em;text-align:right}.markdown-preview .mathjax-exps .MathJax_Display{text-align:center !important}.markdown-preview:not([for="preview"]) .code-chunk .btn-group{display:none}.markdown-preview:not([for="preview"]) .code-chunk .status{display:none}.markdown-preview:not([for="preview"]) .code-chunk .output-div{margin-bottom:16px}.scrollbar-style::-webkit-scrollbar{width:8px}.scrollbar-style::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}.scrollbar-style::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode]){position:relative;width:100%;height:100%;top:0;left:0;margin:0;padding:0;overflow:auto}html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{position:relative;top:0}@media screen and (min-width:914px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{padding:2em calc(50% - 457px)}}@media screen and (max-width:914px){html body[for="html-export"]:not([data-presentation-mode]) 
.markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{font-size:14px !important;padding:1em}}@media print{html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{display:none}}html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{position:fixed;bottom:8px;left:8px;font-size:28px;cursor:pointer;color:inherit;z-index:99;width:32px;text-align:center;opacity:.4}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] #sidebar-toc-btn{opacity:1}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc{position:fixed;top:0;left:0;width:300px;height:100%;padding:32px 0 48px 0;font-size:14px;box-shadow:0 0 4px rgba(150,150,150,0.33);box-sizing:border-box;overflow:auto;background-color:inherit}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar{width:8px}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc a{text-decoration:none}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{padding:0 1.6em;margin-top:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc li{margin-bottom:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{list-style-type:none}html 
body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{left:300px;width:calc(100% -  300px);padding:2em calc(50% - 457px -  150px);margin:0;box-sizing:border-box}@media screen and (max-width:1274px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{width:100%}}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .markdown-preview{left:50%;transform:translateX(-50%)}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .md-sidebar-toc{display:none}
-/* Please visit the URL below for more information: */
-/*   https://shd101wyy.github.io/markdown-preview-enhanced/#/customize-css */
- 
-      </style>
-    </head>
-    <body for="html-export">
-      <div class="mume markdown-preview   ">
-      <h1 class="mume-header" id="datasets-privacy">Datasets Privacy</h1>
-
-<h2 class="mume-header" id="introduction">Introduction</h2>
-
-<p>This document presents the datasets generated for Scava, discusses the implications it has regarding privacy, and describes what has been achieved to ensure data is safe.</p>
-<p>All datasets are anonymised: fields that could be used to identify individuals or companies either directly or indirectly have been transformed using the <a href="https://github.com/borisbaldassari/data-anonymiser">Anonymise::Utility Perl module</a>.</p>
-<p>The intended audience of the datasets is composed of:</p>
-<ul>
-<li>Research laboratories, mainly in the field of software engineering.</li>
-<li>Software engineering practitioners, who may find it useful to have real-world examples of software development projects.</li>
-</ul>
-<p>Should one have questions or remarks on the datasets, please <a href="https://www.crossminer.org/contact">feel free to contact us</a>. All cases related to privacy will be handled with utmost diligence.</p>
-<h2 class="mume-header" id="description-of-the-datasets">Description of the datasets</h2>
-
-<p>There are three types of datasets generated, each with its specific schema and attributes. The first step to preserve privacy is to describe the various datasets and their attributes, and identify what field could pose a threat.</p>
-<h3 class="mume-header" id="aeri-stacktraces">AERI stacktraces</h3>
-
-<p>The <a href="http://download.eclipse.org/scava/datasets/aeri_stacktraces/aeri_stacktraces.html">AERI stacktraces dataset</a> contains information about exceptions encountered by users in the Eclipse IDE. It includes data about the exception itself, and the environment where it happened.</p>
-<p>The <a href="../datasets/aeri_stacktraces/aeri_stacktraces.html#format-incidents">incidents dataset</a> offers the following attributes:</p>
-<ul>
-<li><strong>Message</strong> (String) A short text summarising the error.</li>
-<li><strong>Code</strong> (Integer) The numeric status code logged with the error.</li>
-<li><strong>Severity</strong> (Factors) An estimate by the user reporting the error about its perceived severity.</li>
-<li><strong>Kind</strong> (Factors) The type of error recorded, as identified by the AERI system.</li>
-<li><strong>Plugin ID</strong> (String) The ID of the Eclipse plugin that threw the exception.</li>
-<li><strong>Plugin Version</strong> (String) The version of the Eclipse plugin that threw the exception.</li>
-<li><strong>Status fingerprint</strong> (String) An identifier for the status of the incident. Used for duplicates detection.</li>
-<li><strong>Incident fingerprint</strong> (String) An identifier for the incident. Used for duplicates detection.</li>
-<li><strong>Incident fingerprint2</strong> (String) An identifier for the incident. Used for duplicates detection.</li>
-<li><strong>Timestamp</strong> (Date ISO 8601) The time of creation of the incident.</li>
-<li><strong>Saved On</strong> (Date ISO 8601) The time of last save of the problem.</li>
-<li><strong>OSGi Architecture</strong> (Factors) The architecture of the host, as specified in the OSGi bundle definition.</li>
-<li><strong>OSGi OS</strong> (Factors) The host operating system, as reported in OSGi.</li>
-<li><strong>OSGi OS Version</strong> (Factors) The host operating system version, as reported in OSGi.</li>
-<li><strong>OSGi Window Manager</strong> (Factors) The Window Manager used by the host, as reported in OSGi.</li>
-<li><strong>Eclipse Build ID</strong> (String) The Build ID of the Eclipse instance running when the exception occurred.</li>
-<li><strong>Eclipse Product</strong> (String) The Eclipse product impacted by the exception.</li>
-<li><strong>Java runtime version</strong> (String) The Java runtime of the host.</li>
-<li><strong>Comment Quality</strong> (Factors) An estimate of the user comment&#x2019;s quality (thoughtfulness). User comments help people better understand the context of the exception.</li>
-</ul>
-<p>The <a href="../datasets/aeri_stacktraces/aeri_stacktraces.html#format-problems">problems dataset</a> offers the following attributes:</p>
-<ul>
-<li><strong>Summary</strong> (String) A short text summarising the error.</li>
-<li><strong>Number of reporters</strong> (Integer) The number of people who reported this incident or problem.</li>
-<li><strong>Number of incidents</strong> (Integer) The number of times this problem was identified in incidents.</li>
-<li><strong>V1 Status</strong> (Factors) The status of the problem attached to the error report.</li>
-<li><strong>Kind</strong> (Factors) The type of error recorded, as identified by the AERI system.</li>
-<li><strong>Created On</strong> (Date ISO 8601) The time of first appearance of the problem in an incident.</li>
-<li><strong>Modified On</strong> (Date ISO 8601) The time of last update of the problem in an incident.</li>
-<li><strong>Saved On</strong> (Date ISO 8601) The time of last save of the problem.</li>
-<li><strong>OSGi Architecture</strong> (Factors) The architecture of the host, as specified in the OSGi bundle definition.</li>
-<li><strong>OSGi OS</strong> (Factors) The host operating system, as reported in OSGi.</li>
-<li><strong>OSGi OS Version</strong> (Factors) The host operating system version, as reported in OSGi.</li>
-<li><strong>OSGi Window Manager</strong> (Factors) The Window Manager used by the host, as reported in OSGi.</li>
-<li><strong>Eclipse Build ID</strong> (String) The Build ID of the Eclipse instance running when the exception occurred.</li>
-<li><strong>Eclipse Product</strong> (String) The Eclipse product impacted by the exception.</li>
-<li><strong>Java runtime version</strong> (String) The Java runtime of the host.</li>
-</ul>
-<p>The <a href="http://download.eclipse.org/scava/datasets/aeri_stacktraces/incidents_bundles_extract.csv.bz2">incidents bundle</a> offers the following attributes:</p>
-<ul>
-<li><strong>Bundle name</strong> (String) The name of the bundle impacted by the exception.</li>
-<li><strong>Bundle version</strong> (String) The version of the bundle impacted by the exception.</li>
-<li><strong>Value</strong> (Integer) The number of times the exception appeared for this bundle (name + version).</li>
-</ul>
-<h3 class="mume-header" id="eclipse-mailing-lists">Eclipse Mailing lists</h3>
-
-<p>The <a href="../datasets/eclipse_mls/mbox_analysis.html">Eclipse mailing lists dataset</a> offers the following attributes:</p>
-<ul>
-<li><strong>List</strong> (String) The mailing list and project of the post.</li>
-<li><strong>messageId</strong> (String) A unique identifier for the post.</li>
-<li><strong>Subject</strong> (String) The subject of the post as sent on the mailing list.</li>
-<li><strong>Sent at</strong> (Date ISO 8601) The time of sending for the post.</li>
-<li><span style="color:red;font-size:120%"> &#x2623;&#xFE0F; </span> <strong>Sender name</strong> (String) The name of the sender of the post. Names are obfuscated, e.g. <code>HKmwHIC4dREThJRj</code>.</li>
-<li><span style="color:red;font-size:120%"> &#x2623;&#xFE0F; </span> <strong>Sender address</strong> (String) The email address of the sender of the post. Email address is obfuscated, e.g. <code>xzrEaN24LhYew151@HAYhBP6A1UVpXiHt</code>.</li>
-</ul>
-<h3 class="mume-header" id="eclipse-projects-extracts">Eclipse projects extracts</h3>
-
-<p>The <a href="../datasets/projects/eclipse_projects.html">Eclipse projects extracts</a> have different sets of data depending on the sources available for each project. The full list of extracts is given below, highlighting attributes that include privacy-related information.</p>
-<p>Git (Software Configuration Management)</p>
-<ul>
-<li><strong>git_commits_evol.csv</strong> contains the daily number of commits and distinct authors.</li>
-<li><span style="color:red;font-size:120%"> &#x2623;&#xFE0F; </span> <strong>git_log.txt</strong> contains a transcript of the <code>git log</code> command, including the name and email of commit authors. Name is replaced by XXX&apos;s and email address is obfuscated, e.g. <code>xzrEaN24LhYew151@HAYhBP6A1UVpXiHt</code>.</li>
-</ul>
-<p>Bugzilla (Issue tracking)</p>
-<ul>
-<li><strong>bugzilla_components.csv</strong> contains the number of issues submitted against each component.</li>
-<li><strong>bugzilla_evol.csv</strong> contains the daily number of issues submitted and distinct authors.</li>
-<li><span style="color:red;font-size:120%"> &#x2623;&#xFE0F; </span> <strong>bugzilla_issues.csv</strong> contains the list of issues for the project, including the emails of the author and the assignee for each submitted issue. Emails are obfuscated, e.g. <code>xzrEaN24LhYew151@HAYhBP6A1UVpXiHt</code>.</li>
-<li><span style="color:red;font-size:120%"> &#x2623;&#xFE0F; </span> <strong>bugzilla_issues_open.csv</strong> contains the list of open issues for the project, including the emails of the author and the assignee for each submitted issue. Emails are obfuscated, e.g. <code>xzrEaN24LhYew151@HAYhBP6A1UVpXiHt</code>.</li>
-</ul>
-<p>Forums (User-oriented communication)</p>
-<ul>
-<li><strong>eclipse_forums_posts.csv</strong> contains the full list of posts on the project&apos;s forum. It includes an Integer representation of the author of the post as returned by the API (no obfuscation needed).</li>
-<li><strong>eclipse_forums_threads.csv</strong> contains the full list of threads on the project&apos;s forum. It includes an Integer representation of the first and last author of the thread, as returned by the API (no obfuscation needed).</li>
-</ul>
-<p>PMI (project metadata)</p>
-<ul>
-<li><strong>eclipse_pmi_checks.csv</strong> contains a list of checks (values, usefulness, consistency) applied to the Project Management Infrastructure record for the project.</li>
-</ul>
-<p>SonarQube (code analysis)</p>
-<ul>
-<li><strong>sq_issues_blocker.csv</strong> contains the list of SonarQube issues with severity set to blocker.</li>
-<li><strong>sq_issues_critical.csv</strong> contains the list of SonarQube issues with severity set to critical.</li>
-<li><strong>sq_issues_major.csv</strong> contains the list of SonarQube issues with severity set to major.</li>
-<li><strong>sq_metrics.csv</strong> contains the list of metrics computed by Sonarqube.</li>
-</ul>
-<h2 class="mume-header" id="anonymisation">Anonymisation</h2>
-
-<p>The mechanism used to anonymise the data is the <a href="https://github.com/borisbaldassari/data-anonymiser">Anonymise::Utility Perl module</a>. It basically uses asymmetric encryption to generate a one-off mapping between clear IDs and obfuscated strings.</p>
-<p><img src="./data_transformation.png" alt="Data transformation" title="Data transformation"></p>
-<p>The private key is thrown away, preventing any recovery of the encrypted IDs. This technique has several advantages:</p>
-<ul>
-<li>Identical clear-text strings are translated to the same obfuscated string. This enables researchers and analysts to identify same occurrences of an item without any information about its actual content.</li>
-<li>The private key is thrown away immediately, making it impossible for an attacker to use it to decrypt the dataset. The algorithm used is the <a href="https://metacpan.org/pod/Crypt::PK::RSA">Perl implementation of RSA</a>, which is considered reasonably strong for our purpose.</li>
-<li>The public key is re-generated for each session, making it impossible for an attacker to rebuild the mapping or use rainbow tables.</li>
-</ul>
-<p><strong>The resulting datasets contain no email address, names, user id or machine id.</strong></p>
-<h2 class="mume-header" id="privacy-compliance">Privacy compliance</h2>
-
-<p>The management and publication of data in the European Union is regulated by the <strong>General Data Protection Regulation</strong> (GDPR), which also addresses the export of data outside the EU and EEA areas. Since we are EU citizens -- and considering also that the Crossminer project is funded by the H2020 EU research program -- we are to abide by this regulation. Besides the legal implications of publishing open datasets, we are willing to make sure that everybody, individuals or companies, involved in the data is safe.</p>
-<p>In the case of software engineering data, there is a <a href="https://github.com/dspinellis/awesome-msr">huge amount of public information</a> readily available without any restrictions. Most, if not all, tools used in the open-source world provide information about who did what and when -- which is undoubtedly useful for collaboration and community. It is also mandatory regarding intellectual property processes: when one contributes a file to an open-source project, it is at the very least good practice to put her name (and maybe email address) in the header of the file along with the licence used. When Intellectual Property is an important concern, like for the Eclipse Foundation, it simply is <em>required</em> since we need to know who that work belongs to in the case of IP issues and lawsuits.</p>
-<p>The publication of open data in this context, i.e. with the original data being already publicly available from public tools, is a specific case of the GDPR and it is hard to find any reliable information about how it should be conducted. As a result we relied on similar studies and articles and proceeded on a best-effort basis to provide datasets to our users which are as useful and safe as possible.</p>
-<p>Considering that:</p>
-<ul>
-<li><strong>Original data is already publicly available</strong> through the tools themselves (Git, Bugzilla, Mailing lists and forums) and their APIs.</li>
-<li>We provide a <strong>complete description</strong> of the content of the datasets, <strong>identifying the risks</strong> and <strong>describing the mitigation steps</strong> we went through to ensure that the data is safe.</li>
-<li>To the best of our knowledge <strong>there is no way to decrypt or reverse-engineer the obfuscated information</strong>. The method used for anonymisation is so strong that only knowing the original data could help re-identify it.</li>
-</ul>
-<p>Considering also that:</p>
-<ul>
-<li>The goal of this processing is to provide <strong>free and open resources to help scientific research</strong>, which is in the <strong>public interest</strong> as defined in <a href="https://gdpr-info.eu/art-6-gdpr/">Article 6.1 (e)</a>.</li>
-<li>The Eclipse forge hosts open source and collaborative projects only, and all contributions are made under a <strong>required signed agreement</strong> known as the <a href="https://www.eclipse.org/legal/ECA.php">Eclipse Contributor Agreement</a>: people explicitly and knowingly give their consent to make their contribution public.</li>
-</ul>
-<p>We assume that both the <strong>data itself and its publication are safe</strong>, regarding both the users and the current regulation.</p>
-<h2 class="mume-header" id="references">References</h2>
-
-<ul>
-<li><a href="https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32016R0679&amp;from=EN">GDPR official text (HTML)</a></li>
-<li><a href="https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32016R0679&amp;from=EN">GDPR official text (PDF)</a></li>
-<li><a href="https://blogs.openaire.eu/?p=3248">GDPR and the research process: What you need to know</a></li>
-<li><a href="https://blog.infinigate.co.uk/gdpr-personal-data-public-domain">GDPR &amp; Personal Data in the Public Domain</a></li>
-<li><a href="https://www.europeandataportal.eu/en/highlights/how-address-privacy-concerns-when-opening-data">How to address privacy concerns when opening data</a></li>
-</ul>
-
-      </div>
-      
-      
-    
-    
-    
-    
-    
-    
-    
-    
-  
-    </body></html>
\ No newline at end of file
diff --git a/docs/datasets_privacy.md b/docs/datasets_privacy.md
deleted file mode 100644
index 91b5cec4537fc14526bd2e0517ef7b83f7dbff31..0000000000000000000000000000000000000000
--- a/docs/datasets_privacy.md
+++ /dev/null
@@ -1,156 +0,0 @@
-
-# Datasets Privacy
-
-## Introduction
-
-This document presents the datasets generated for Scava, discusses the implications it has regarding privacy, and describes what has been achieved to ensure data is safe.
-
-All datasets are anonymised: fields that could be used to identify individuals or companies either directly or indirectly have been transformed using the [Anonymise::Utility Perl module](https://github.com/borisbaldassari/data-anonymiser).
-
-The intended audience of the datasets is composed of:
-
-* Research laboratories, mainly in the field of software engineering.
-* Software engineering practitioners, who may find it useful to have real-world examples of software development projects.
-
-Should one have questions or remarks on the datasets, please [feel free to contact us](https://www.crossminer.org/contact). All cases related to privacy will be handled with utmost diligence.
-
-
-## Description of the datasets
-
-There are three types of datasets generated, each with its specific schema and attributes. The first step to preserve privacy is to describe the various datasets and their attributes, and identify what field could pose a threat.
-
-
-### AERI stacktraces
-
-The [AERI stacktraces dataset](http://download.eclipse.org/scava/datasets/aeri_stacktraces/aeri_stacktraces.html) contains information about exceptions encountered by users in the Eclipse IDE. It includes data about the exception itself, and the environment where it happened.
-
-The [incidents dataset](../datasets/aeri_stacktraces/aeri_stacktraces.html#format-incidents) offers the following attributes:
-
-* **Message** (String) A short text summarising the error.
-* **Code** (Integer) The numeric status code logged with the error.
-* **Severity** (Factors) An estimate by the user reporting the error about its perceived severity.
-* **Kind** (Factors) The type of error recorded, as identified by the AERI system.
-* **Plugin ID** (String) The ID of the Eclipse plugin that threw the exception.
-* **Plugin Version** (String) The version of the Eclipse plugin that threw the exception.
-* **Status fingerprint** (String) An identifier for the status of the incident. Used for duplicates detection.
-* **Incident fingerprint** (String) An identifier for the incident. Used for duplicates detection.
-* **Incident fingerprint2** (String) An identifier for the incident. Used for duplicates detection.
-* **Timestamp** (Date ISO 8601) The time of creation of the incident.
-* **Saved On** (Date ISO 8601) The time of last save of the problem.
-* **OSGi Architecture** (Factors) The architecture of the host, as specified in the OSGi bundle definition.
-* **OSGi OS** (Factors) The host operating system, as reported in OSGi.
-* **OSGi OS Version** (Factors) The host operating system version, as reported in OSGi.
-* **OSGi Window Manager** (Factors) The Window Manager used by the host, as reported in OSGi.
-* **Eclipse Build ID** (String) The Build ID of the Eclipse instance running when the exception occurred.
-* **Eclipse Product** (String) The Eclipse product impacted by the exception.
-* **Java runtime version** (String) The Java runtime of the host.
-* **Comment Quality** (Factors) An estimate of the user comment’s quality (thoughtfulness). User comments help people better understand the context of the exception.
-
-The [problems dataset](../datasets/aeri_stacktraces/aeri_stacktraces.html#format-problems) offers the following attributes:
-
-* **Summary** (String) A short text summarising the error.
-* **Number of reporters** (Integer) The number of people who reported this incident or problem.
-* **Number of incidents** (Integer) The number of times this problem was identified in incidents.
-* **V1 Status** (Factors) The status of the problem attached to the error report.
-* **Kind** (Factors) The type of error recorded, as identified by the AERI system.
-* **Created On** (Date ISO 8601) The time of first appearance of the problem in an incident.
-* **Modified On** (Date ISO 8601) The time of last update of the problem in an incident.
-* **Saved On** (Date ISO 8601) The time of last save of the problem.
-* **OSGi Architecture** (Factors) The architecture of the host, as specified in the OSGi bundle definition.
-* **OSGi OS** (Factors) The host operating system, as reported in OSGi.
-* **OSGi OS Version** (Factors) The host operating system version, as reported in OSGi.
-* **OSGi Window Manager** (Factors) The Window Manager used by the host, as reported in OSGi.
-* **Eclipse Build ID** (String) The Build ID of the Eclipse instance running when the exception occurred.
-* **Eclipse Product** (String) The Eclipse product impacted by the exception.
-* **Java runtime version** (String) The Java runtime of the host.
-
-The [incidents bundle](http://download.eclipse.org/scava/datasets/aeri_stacktraces/incidents_bundles_extract.csv.bz2) offers the following attributes:
-
-* **Bundle name** (String) The name of the bundle impacted by the exception.
-* **Bundle version** (String) The version of the bundle impacted by the exception.
-* **Value** (Integer) The number of times the exception appeared for this bundle (name + version).
-
-
-### Eclipse Mailing lists
-
-The [Eclipse mailing lists dataset](../datasets/eclipse_mls/mbox_analysis.html) offers the following attributes:
-
-* **List** (String) The mailing list and project of the post.
-* **messageId** (String) A unique identifier for the post.
-* **Subject** (String) The subject of the post as sent on the mailing list.
-* **Sent at** (Date ISO 8601) The time of sending for the post.
-* <span style="color:red;font-size:120%"> :biohazard: </span> **Sender name** (String) The name of the sender of the post. Names are obfuscated, e.g. `HKmwHIC4dREThJRj`.
-* <span style="color:red;font-size:120%"> :biohazard: </span> **Sender address** (String) The email address of the sender of the post. Email address is obfuscated, e.g. `xzrEaN24LhYew151@HAYhBP6A1UVpXiHt`.
-
-
-### Eclipse projects extracts
-
-The [Eclipse projects extracts](../datasets/projects/eclipse_projects.html) have different sets of data depending on the sources available for each project. We list thereafter the full list of extracts, highlighting attributes that include privacy-related information.
-
-Git (Software Configuration Management)
-* **git_commits_evol.csv** contains the daily number of commits and distinct authors.
-* <span style="color:red;font-size:120%"> :biohazard: </span> **git_log.txt** contains the transcript of the `git log` command, including the name and email of commit authors. Name is replaced by XXX's and email address is obfuscated, e.g. `xzrEaN24LhYew151@HAYhBP6A1UVpXiHt`.
-
-Bugzilla (Issue tracking)
-* **bugzilla_components.csv** contains the number of issues submitted against each component.
-* **bugzilla_evol.csv** contains the daily number of issues submitted and distinct authors.
-* <span style="color:red;font-size:120%"> :biohazard: </span> **bugzilla_issues.csv** contains the list of issues for the project, including the emails of the author and the assignee for each submitted issue. Emails are obfuscated, e.g. `xzrEaN24LhYew151@HAYhBP6A1UVpXiHt`.
-* <span style="color:red;font-size:120%"> :biohazard: </span> **bugzilla_issues_open.csv** contains the list of open issues for the project, including the emails of the author and the assignee for each submitted issue. Emails are obfuscated, e.g. `xzrEaN24LhYew151@HAYhBP6A1UVpXiHt`.
-
-Forums (User-oriented communication)
-* **eclipse_forums_posts.csv** contains the full list of posts on the project's forum. It includes an Integer representation of the author of the post as returned by the API (no obfuscation needed).
-* **eclipse_forums_threads.csv** contains the full list of threads on the project's forum. It includes an Integer representation of the first and last author of the thread, as returned by the API (no obfuscation needed).
-
-PMI (project metadata)
-* **eclipse_pmi_checks.csv** contains a list of checks (values, usefulness, consistency) applied to the Project Management Infrastructure record for the project.
-
-SonarQube (code analysis)
-* **sq_issues_blocker.csv** contains the list of SonarQube issues with severity set to blocker.
-* **sq_issues_critical.csv** contains the list of SonarQube issues with severity set to critical.
-* **sq_issues_major.csv** contains the list of SonarQube issues with severity set to major.
-* **sq_metrics.csv** contains the list of metrics computed by SonarQube.
-
-
-## Anonymisation
-
-The mechanism used to anonymise the data is the [Anonymise::Utility Perl module](https://github.com/borisbaldassari/data-anonymiser). It basically uses asymmetric encryption to generate a one-off mapping between clear IDs and obfuscated strings.
-
-![Data transformation](./data_transformation.png "Data transformation")
-
-The private key is thrown away, preventing any recovery of the encrypted IDs. This technique has several advantages:
-
-* Identical clear-text strings are translated to the same obfuscated string. This enables researchers and analysts to identify same occurrences of an item without any information about its actual content.
-* The private key is thrown away immediately, making it impossible for an attacker to use it to decrypt the dataset. The algorithm used is the [Perl implementation of RSA](https://metacpan.org/pod/Crypt::PK::RSA), which is considered reasonably strong for our purpose.
-* The public key is re-generated for each session, making it impossible for an attacker to rebuild the mapping or use rainbow tables.
-
-**The resulting datasets contain no email address, names, user id or machine id.**
-
-
-## Privacy compliance
-
-The management and publication of data in the European Union is regulated by the **General Data Protection Regulation** (GDPR) directive, which also addresses the export of data outside the EU and EEA areas. Since we are EU citizens -- and considering also that the Crossminer project is funded by the H2020 EU research program -- we are to abide by this regulation. Besides the legal implications of publishing open datasets, we are willing to make sure that everybody, individuals or companies, involved in the data is safe.
-
-In the case of software engineering data, there is a [huge amount of public information](https://github.com/dspinellis/awesome-msr) readily available without any restrictions. Most, if not all, tools used in the open-source world provide information about who did what and when -- which is undoubtedly useful for collaboration and community. It is also mandatory regarding intellectual property processes: when one contributes a file to an open-source project, it is at the very least good practice to put her name (and maybe email address) in the header of the file along with the licence used. When Intellectual Property is an important concern, like for the Eclipse Foundation, it simply is *required* since we need to know who that work belongs to in the case of IP issues and lawsuits.
-
-The publication of open data in this context, i.e. with the original data being already publicly available from public tools, is a specific case of the GDPR and it is hard to find any reliable information about how it should be conducted. As a result we relied on similar studies and articles and proceeded on a best-effort basis to provide datasets to our users which are as useful and safe as possible.
-
-Considering that:
-
-* **Original data is already publicly available** through the tools themselves (Git, Bugzilla, Mailing lists and forums) and their APIs.
-* We provide a **complete description** of the content of the datasets, **identifying the risks** and **describing the mitigation steps** we went through to ensure that the data is safe.
-* To the best of our knowledge **there is no way to decrypt or reverse-engineer the obfuscated information**. The method used for anonymisation is so strong that only knowing the original data could help re-identify it.
-
-Considering also that:
-* The goal of this processing is to provide **free and open resources to help scientific research**, which is in the **public interest** as defined in [Article 6.1 (e)](https://gdpr-info.eu/art-6-gdpr/).
-* The Eclipse forge hosts open source and collaborative projects only, and all contributions are made under a **required signed agreement** known as the [Eclipse Contributor Agreement](https://www.eclipse.org/legal/ECA.php): people explicitly and knowingly give their consent to make their contribution public.
-
-We assume that both the **data itself and its publication are safe**, regarding both the users and the current regulation.
-
-
-## References
-
-* [GDPR official text (HTML)](https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32016R0679&from=EN)
-* [GDPR official text (PDF)](https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32016R0679&from=EN)
-* [GDPR and the research process: What you need to know](https://blogs.openaire.eu/?p=3248)
-* [GDPR & Personal Data in the Public Domain](https://blog.infinigate.co.uk/gdpr-personal-data-public-domain)
-* [How to address privacy concerns when opening data](https://www.europeandataportal.eu/en/highlights/how-address-privacy-concerns-when-opening-data)
diff --git a/docs/scava-header.jpg b/docs/scava-header.jpg
deleted file mode 100644
index c06ab2b5b71588d4bf46dc76c07265428c09e631..0000000000000000000000000000000000000000
Binary files a/docs/scava-header.jpg and /dev/null differ
diff --git a/docs/scava.html b/docs/scava.html
deleted file mode 100644
index fd680784122a345e7cf026bba00e6c72c40b9fce..0000000000000000000000000000000000000000
--- a/docs/scava.html
+++ /dev/null
@@ -1,341 +0,0 @@
-<!DOCTYPE html><html><head>
-      <title>scava</title>
-      <meta charset="utf-8">
-      <meta name="viewport" content="width=device-width, initial-scale=1.0">
-      
-      <link rel="stylesheet" href="file:////home/boris/.atom/packages/markdown-preview-enhanced/node_modules/@shd101wyy/mume/dependencies/katex/katex.min.css">
-      
-      
-      
-      
-      
-      
-      
-      
-      
-      <style>
-      /**
- * prism.js Github theme based on GitHub's theme.
- * @author Sam Clarke
- */
-code[class*="language-"],
-pre[class*="language-"] {
-  color: #333;
-  background: none;
-  font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
-  text-align: left;
-  white-space: pre;
-  word-spacing: normal;
-  word-break: normal;
-  word-wrap: normal;
-  line-height: 1.4;
-
-  -moz-tab-size: 8;
-  -o-tab-size: 8;
-  tab-size: 8;
-
-  -webkit-hyphens: none;
-  -moz-hyphens: none;
-  -ms-hyphens: none;
-  hyphens: none;
-}
-
-/* Code blocks */
-pre[class*="language-"] {
-  padding: .8em;
-  overflow: auto;
-  /* border: 1px solid #ddd; */
-  border-radius: 3px;
-  /* background: #fff; */
-  background: #f5f5f5;
-}
-
-/* Inline code */
-:not(pre) > code[class*="language-"] {
-  padding: .1em;
-  border-radius: .3em;
-  white-space: normal;
-  background: #f5f5f5;
-}
-
-.token.comment,
-.token.blockquote {
-  color: #969896;
-}
-
-.token.cdata {
-  color: #183691;
-}
-
-.token.doctype,
-.token.punctuation,
-.token.variable,
-.token.macro.property {
-  color: #333;
-}
-
-.token.operator,
-.token.important,
-.token.keyword,
-.token.rule,
-.token.builtin {
-  color: #a71d5d;
-}
-
-.token.string,
-.token.url,
-.token.regex,
-.token.attr-value {
-  color: #183691;
-}
-
-.token.property,
-.token.number,
-.token.boolean,
-.token.entity,
-.token.atrule,
-.token.constant,
-.token.symbol,
-.token.command,
-.token.code {
-  color: #0086b3;
-}
-
-.token.tag,
-.token.selector,
-.token.prolog {
-  color: #63a35c;
-}
-
-.token.function,
-.token.namespace,
-.token.pseudo-element,
-.token.class,
-.token.class-name,
-.token.pseudo-class,
-.token.id,
-.token.url-reference .token.variable,
-.token.attr-name {
-  color: #795da3;
-}
-
-.token.entity {
-  cursor: help;
-}
-
-.token.title,
-.token.title .token.punctuation {
-  font-weight: bold;
-  color: #1d3e81;
-}
-
-.token.list {
-  color: #ed6a43;
-}
-
-.token.inserted {
-  background-color: #eaffea;
-  color: #55a532;
-}
-
-.token.deleted {
-  background-color: #ffecec;
-  color: #bd2c00;
-}
-
-.token.bold {
-  font-weight: bold;
-}
-
-.token.italic {
-  font-style: italic;
-}
-
-
-/* JSON */
-.language-json .token.property {
-  color: #183691;
-}
-
-.language-markup .token.tag .token.punctuation {
-  color: #333;
-}
-
-/* CSS */
-code.language-css,
-.language-css .token.function {
-  color: #0086b3;
-}
-
-/* YAML */
-.language-yaml .token.atrule {
-  color: #63a35c;
-}
-
-code.language-yaml {
-  color: #183691;
-}
-
-/* Ruby */
-.language-ruby .token.function {
-  color: #333;
-}
-
-/* Markdown */
-.language-markdown .token.url {
-  color: #795da3;
-}
-
-/* Makefile */
-.language-makefile .token.symbol {
-  color: #795da3;
-}
-
-.language-makefile .token.variable {
-  color: #183691;
-}
-
-.language-makefile .token.builtin {
-  color: #0086b3;
-}
-
-/* Bash */
-.language-bash .token.keyword {
-  color: #0086b3;
-}
-
-/* highlight */
-pre[data-line] {
-  position: relative;
-  padding: 1em 0 1em 3em;
-}
-pre[data-line] .line-highlight-wrapper {
-  position: absolute;
-  top: 0;
-  left: 0;
-  background-color: transparent;
-  display: block;
-  width: 100%;
-}
-
-pre[data-line] .line-highlight {
-  position: absolute;
-  left: 0;
-  right: 0;
-  padding: inherit 0;
-  margin-top: 1em;
-  background: hsla(24, 20%, 50%,.08);
-  background: linear-gradient(to right, hsla(24, 20%, 50%,.1) 70%, hsla(24, 20%, 50%,0));
-  pointer-events: none;
-  line-height: inherit;
-  white-space: pre;
-}
-
-pre[data-line] .line-highlight:before, 
-pre[data-line] .line-highlight[data-end]:after {
-  content: attr(data-start);
-  position: absolute;
-  top: .4em;
-  left: .6em;
-  min-width: 1em;
-  padding: 0 .5em;
-  background-color: hsla(24, 20%, 50%,.4);
-  color: hsl(24, 20%, 95%);
-  font: bold 65%/1.5 sans-serif;
-  text-align: center;
-  vertical-align: .3em;
-  border-radius: 999px;
-  text-shadow: none;
-  box-shadow: 0 1px white;
-}
-
-pre[data-line] .line-highlight[data-end]:after {
-  content: attr(data-end);
-  top: auto;
-  bottom: .4em;
-}html body{font-family:"Helvetica Neue",Helvetica,"Segoe UI",Arial,freesans,sans-serif;font-size:16px;line-height:1.6;color:#333;background-color:#fff;overflow:initial;box-sizing:border-box;word-wrap:break-word}html body>:first-child{margin-top:0}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{line-height:1.2;margin-top:1em;margin-bottom:16px;color:#000}html body h1{font-size:2.25em;font-weight:300;padding-bottom:.3em}html body h2{font-size:1.75em;font-weight:400;padding-bottom:.3em}html body h3{font-size:1.5em;font-weight:500}html body h4{font-size:1.25em;font-weight:600}html body h5{font-size:1.1em;font-weight:600}html body h6{font-size:1em;font-weight:600}html body h1,html body h2,html body h3,html body h4,html body h5{font-weight:600}html body h5{font-size:1em}html body h6{color:#5c5c5c}html body strong{color:#000}html body del{color:#5c5c5c}html body a:not([href]){color:inherit;text-decoration:none}html body a{color:#08c;text-decoration:none}html body a:hover{color:#00a3f5;text-decoration:none}html body img{max-width:100%}html body>p{margin-top:0;margin-bottom:16px;word-wrap:break-word}html body>ul,html body>ol{margin-bottom:16px}html body ul,html body ol{padding-left:2em}html body ul.no-list,html body ol.no-list{padding:0;list-style-type:none}html body ul ul,html body ul ol,html body ol ol,html body ol ul{margin-top:0;margin-bottom:0}html body li{margin-bottom:0}html body li.task-list-item{list-style:none}html body li>p{margin-top:0;margin-bottom:0}html body .task-list-item-checkbox{margin:0 .2em .25em -1.8em;vertical-align:middle}html body .task-list-item-checkbox:hover{cursor:pointer}html body blockquote{margin:16px 0;font-size:inherit;padding:0 15px;color:#5c5c5c;background-color:#f0f0f0;border-left:4px solid #d6d6d6}html body blockquote>:first-child{margin-top:0}html body blockquote>:last-child{margin-bottom:0}html body hr{height:4px;margin:32px 0;background-color:#d6d6d6;border:0 none}html body table{margin:10px 0 15px 
0;border-collapse:collapse;border-spacing:0;display:block;width:100%;overflow:auto;word-break:normal;word-break:keep-all}html body table th{font-weight:bold;color:#000}html body table td,html body table th{border:1px solid #d6d6d6;padding:6px 13px}html body dl{padding:0}html body dl dt{padding:0;margin-top:16px;font-size:1em;font-style:italic;font-weight:bold}html body dl dd{padding:0 16px;margin-bottom:16px}html body code{font-family:Menlo,Monaco,Consolas,'Courier New',monospace;font-size:.85em !important;color:#000;background-color:#f0f0f0;border-radius:3px;padding:.2em 0}html body code::before,html body code::after{letter-spacing:-0.2em;content:"\00a0"}html body pre>code{padding:0;margin:0;font-size:.85em !important;word-break:normal;white-space:pre;background:transparent;border:0}html body .highlight{margin-bottom:16px}html body .highlight pre,html body pre{padding:1em;overflow:auto;font-size:.85em !important;line-height:1.45;border:#d6d6d6;border-radius:3px}html body .highlight pre{margin-bottom:0;word-break:normal}html body pre code,html body pre tt{display:inline;max-width:initial;padding:0;margin:0;overflow:initial;line-height:inherit;word-wrap:normal;background-color:transparent;border:0}html body pre code:before,html body pre tt:before,html body pre code:after,html body pre tt:after{content:normal}html body p,html body blockquote,html body ul,html body ol,html body dl,html body pre{margin-top:0;margin-bottom:16px}html body kbd{color:#000;border:1px solid #d6d6d6;border-bottom:2px solid #c7c7c7;padding:2px 4px;background-color:#f0f0f0;border-radius:3px}@media print{html body{background-color:#fff}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{color:#000;page-break-after:avoid}html body blockquote{color:#5c5c5c}html body pre{page-break-inside:avoid}html body table{display:table}html body img{display:block;max-width:100%;max-height:100%}html body pre,html body 
code{word-wrap:break-word;white-space:pre}}.markdown-preview{width:100%;height:100%;box-sizing:border-box}.markdown-preview .pagebreak,.markdown-preview .newpage{page-break-before:always}.markdown-preview pre.line-numbers{position:relative;padding-left:3.8em;counter-reset:linenumber}.markdown-preview pre.line-numbers>code{position:relative}.markdown-preview pre.line-numbers .line-numbers-rows{position:absolute;pointer-events:none;top:1em;font-size:100%;left:0;width:3em;letter-spacing:-1px;border-right:1px solid #999;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.markdown-preview pre.line-numbers .line-numbers-rows>span{pointer-events:none;display:block;counter-increment:linenumber}.markdown-preview pre.line-numbers .line-numbers-rows>span:before{content:counter(linenumber);color:#999;display:block;padding-right:.8em;text-align:right}.markdown-preview .mathjax-exps .MathJax_Display{text-align:center !important}.markdown-preview:not([for="preview"]) .code-chunk .btn-group{display:none}.markdown-preview:not([for="preview"]) .code-chunk .status{display:none}.markdown-preview:not([for="preview"]) .code-chunk .output-div{margin-bottom:16px}.scrollbar-style::-webkit-scrollbar{width:8px}.scrollbar-style::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}.scrollbar-style::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode]){position:relative;width:100%;height:100%;top:0;left:0;margin:0;padding:0;overflow:auto}html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{position:relative;top:0}@media screen and (min-width:914px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{padding:2em calc(50% - 457px + 2em)}}@media screen and (max-width:914px){html body[for="html-export"]:not([data-presentation-mode]) 
.markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{font-size:14px !important;padding:1em}}@media print{html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{display:none}}html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{position:fixed;bottom:8px;left:8px;font-size:28px;cursor:pointer;color:inherit;z-index:99;width:32px;text-align:center;opacity:.4}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] #sidebar-toc-btn{opacity:1}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc{position:fixed;top:0;left:0;width:300px;height:100%;padding:32px 0 48px 0;font-size:14px;box-shadow:0 0 4px rgba(150,150,150,0.33);box-sizing:border-box;overflow:auto;background-color:inherit}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar{width:8px}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc a{text-decoration:none}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{padding:0 1.6em;margin-top:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc li{margin-bottom:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{list-style-type:none}html 
body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{left:300px;width:calc(100% -  300px);padding:2em calc(50% - 457px -  150px);margin:0;box-sizing:border-box}@media screen and (max-width:1274px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{width:100%}}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .markdown-preview{left:50%;transform:translateX(-50%)}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .md-sidebar-toc{display:none}
-/* Please visit the URL below for more information: */
-/*   https://shd101wyy.github.io/markdown-preview-enhanced/#/customize-css */
-
-      </style>
-    </head>
-    <body for="html-export">
-      <div class="mume markdown-preview  ">
-      <h1 class="mume-header" id="eclipse-scava-downloads">Eclipse Scava Downloads</h1>
-
-<p><img src="scava-header.jpg" alt="Scava logo"></p>
-<p>This web site hosts the open datasets generated in the course of the <a href="https://crossminer.org">Crossminer research project</a>. Crossminer ended in 2019, and since then the datasets have been maintained by <a href="https://castalia.solutions">Castalia Solutions</a> as a service for the Eclipse and Research communities.</p>
-<p>The datasets include various pieces of data retrieved from the Eclipse forge: Mailing lists, Project development data, and AERI stacktraces in handy CSV and JSON formats. Each dataset has an R Markdown document describing its content and providing hints about how to use it. Examples provided mainly use the <a href="https://r-project.org">R statistical analysis software</a>.</p>
-<p>All datasets are published under the <a href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons BY-Attribution-Share Alike 4.0 (International)</a>.</p>
-<p>All data is anonymised; please see the <a href="datasets_privacy.html">dedicated document</a> to learn more about privacy and the anonymisation mechanism.</p>
-<p>We&apos;re open: if you&apos;d like to contribute, or for any request or question, please see the <a href="https://gitlab.eclipse.org/bbaldassari2kd/scava-datasets">Eclipse GitLab project</a> page.</p>
-<h2 class="mume-header" id="eclipse-projects">Eclipse projects</h2>
-
-<p>We generate comprehensive data extracts of a <a href="projects/eclipse_projects.html">set of Eclipse projects</a>, including data sources like:</p>
-<ul>
-<li>Software Configuration Management (Eclipse <a href="https://git.eclipse.org">git</a> or <a href="https://github.com">GitHub</a>),</li>
-<li>Issues tracking (<a href="https://bugs.eclipse.org">Bugzilla</a> or <a href="https://github.com">GitHub</a>),</li>
-<li>Project metadata (PMI) checks (<a href="https://projects.eclipse.org">PMI</a>),</li>
-<li>Licencing and copyrights (<a href="https://github.com/nexB/scancode-toolkit">Scancode</a>), and</li>
-<li>Static Code Analysis (<a href="https://sonarcloud.io">SonarCloud</a>) when available.</li>
-</ul>
-<p>These datasets are updated weekly, at 2am on Sunday. If you would like to add a project, please <a href="https://gitlab.eclipse.org/bbaldassari2kd/scava-datasets/-/issues">submit an issue</a>.</p>
-<p><strong>Downloads</strong></p>
-<ul>
-<li><strong>List of projects</strong> See the <a href="projects/eclipse_projects.html">list of projects with their associated datasets and documentation</a>.</li>
-</ul>
-<h2 class="mume-header" id="eclipse-mailing-lists">Eclipse mailing lists</h2>
-
-<p>The <a href="eclipse_mls/eclipse_mls.html">Eclipse Mailing lists</a> dump is an extract of all emails posted on the Eclipse mailing lists.</p>
-<ul>
-<li>Download the <strong>Eclipse mailing lists dataset</strong> [ <a href="eclipse_mls/eclipse_mls.gz">CSV</a> ].</li>
-<li>Check the <strong>documentation</strong> for the dataset <a href="eclipse_mls/mbox_csv_analysis.html">here (HTML)</a>. For reproducibility we also provide the <a href="eclipse_mls/mbox_csv_analysis.rmd">R Markdown document</a> for the dataset analysis and documentation.</li>
-<li>Download the <strong>mbox files</strong> [ <a href="eclipse_mls/eclipse_mls.html#project-mboxes">see the list</a> ]</li>
-</ul>
-<p>More information can be found on the official <a href="https://accounts.eclipse.org/mailing-list">Eclipse page for mailing lists</a>.</p>
-<h2 class="mume-header" id="aeri-stacktraces">AERI Stacktraces</h2>
-
-<p>The <a href="aeri_stacktraces/aeri_stacktraces.html">AERI stacktraces dataset</a> is a list of exceptions encountered by users in the Eclipse IDE, as retrieved by the AERI system. The Automated Error Reporting (AERI) system has been developed by the people at <a href="https://www.codetrails.com/">Code Trails</a> and retrieves information about exceptions. It is installed by default in the Eclipse IDE and has helped hundreds of projects better support their users and resolve bugs. This dataset is a dump of all records over a couple of years, with useful information about the exceptions and environment.</p>
-<p>The last update of the dataset occurred on 2018-02-11.</p>
-<p><strong>Downloads</strong></p>
-<ul>
-<li><strong>Problems full</strong> [ <a href="aeri_stacktraces/problems_full.tar.bz2">Download JSON</a> ] -- A list of all problems, exported as JSON (one problem per file).</li>
-<li><strong>Problems extract</strong> [ <a href="aeri_stacktraces/problems_extract.csv.bz2">Download CSV</a> ] -- A list of all problems, exported as CSV (one big file).</li>
-<li><strong>Incidents full</strong> [ <a href="aeri_stacktraces/incidents_full.tar.bz2">Download JSON</a> ] -- A list of all incidents, exported as JSON (one incident per file).</li>
-<li><strong>Incidents extract</strong> [ <a href="aeri_stacktraces/incidents_extract.csv.bz2">Download CSV</a> ] -- A list of all incidents, exported as CSV (one big file).</li>
-<li><strong>Incidents Bundles</strong> [ <a href="aeri_stacktraces/incidents_bundles_extract.csv.bz2">Download CSV</a> ] -- A list of all bundles found in incidents, exported as CSV. Attributes are bundle_name, bundle_version, and number of occurrences.</li>
-</ul>
-<p><strong>Documentation</strong></p>
-<ul>
-<li><strong>Stacktraces Problems analysis document</strong> [ <a href="aeri_stacktraces/problems_analysis.pdf">Download PDF</a> | <a href="aeri_stacktraces/problems_analysis.rmd">Download Rmd</a> ] -- An R Markdown document to analyse the Stacktraces problem dataset, with description of the actual content and examples of usage.</li>
-<li><strong>Stacktraces Incidents analysis document</strong> [ <a href="aeri_stacktraces/incidents_analysis.pdf">Download PDF</a> | <a href="aeri_stacktraces/incidents_analysis.rmd">Download Rmd</a> ] -- An R Markdown document to analyse the Stacktraces incidents dataset, with description of the actual content and examples of usage.</li>
-</ul>
-<p>More information about the AERI system can be found on the <a href="https://www.codetrails.com/error-analytics/manual/">Code Trails website</a>.</p>
-<h2 class="mume-header" id="about-scava">About Scava</h2>
-
-<p>Scava is the Eclipse spin-off of Crossminer, an EU-funded research project. More information can be found at the following places:</p>
-<ul>
-<li>The <a href="https://eclipse.org/scava">Eclipse Scava project</a></li>
-<li>The official <a href="https://scava-docs.readthedocs.io">documentation for Scava</a></li>
-<li>The <a href="https://github.com/crossminer/scava-docs">documentation repository</a></li>
-<li>The official <a href="https://crossminer.org">Crossminer web page</a></li>
-<li>The <a href="https://github.com/crossminer">GitHub Crossminer organisation</a></li>
-</ul>
-<h2 class="mume-header" id="licencing">Licencing</h2>
-
-<p>All datasets are published under the <a href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons BY-Attribution-Share Alike 4.0 (International)</a>.</p>
-<p>All code is, unless otherwise stated, published under the <a href="https://www.eclipse.org/legal/epl-2.0/">Eclipse Public Licence, v2</a>.</p>
-
-      </div>
-      
-      
-    
-    
-    
-    
-    
-    
-    
-    
-  
-    </body></html>
\ No newline at end of file
diff --git a/docs/scava.md b/docs/scava.md
deleted file mode 100644
index 02f2a54e52b99e66acb855395a6037dec6dba0df..0000000000000000000000000000000000000000
--- a/docs/scava.md
+++ /dev/null
@@ -1,81 +0,0 @@
-
-# Eclipse Scava Downloads
-
-![Scava logo](scava-header.jpg)
-
-
-This web site hosts the open datasets generated in the course of the [Crossminer research project](https://crossminer.org). Crossminer ended in 2019, and since then the datasets have been maintained by [Castalia Solutions](https://castalia.solutions) as a service for the Eclipse and Research communities.
-
-The datasets include various pieces of data retrieved from the Eclipse forge: Mailing lists, Project development data, and AERI stacktraces in handy CSV and JSON formats. Each dataset has an R Markdown document describing its content and providing hints about how to use it. Examples provided mainly use the [R statistical analysis software](https://r-project.org).
-
-All datasets are published under the [Creative Commons BY-Attribution-Share Alike 4.0 (International)](https://creativecommons.org/licenses/by-sa/4.0/).
-
-All data is anonymised; please see the [dedicated document](datasets_privacy.html) to learn more about privacy and the anonymisation mechanism.
-
-We're open: if you'd like to contribute, or for any request or question, please see the [Eclipse GitLab project](https://gitlab.eclipse.org/bbaldassari2kd/scava-datasets) page.
-
-
-## Eclipse projects
-
-We generate comprehensive data extracts of a [set of Eclipse projects](projects/eclipse_projects.html), including data sources like:
-
-* Software Configuration Management (Eclipse [git](https://git.eclipse.org) or [GitHub](https://github.com)),
-* Issues tracking ([Bugzilla](https://bugs.eclipse.org) or [GitHub](https://github.com)),
-* Project metadata (PMI) checks ([PMI](https://projects.eclipse.org)),
-* Licencing and copyrights ([Scancode](https://github.com/nexB/scancode-toolkit)), and
-* Static Code Analysis ([SonarCloud](https://sonarcloud.io)) when available.
-
-These datasets are updated weekly, at 2am on Sunday. If you would like to add a project, please [submit an issue](https://gitlab.eclipse.org/bbaldassari2kd/scava-datasets/-/issues).
-
-**Downloads**
-
-* **List of projects** See the [list of projects with their associated datasets and documentation](projects/eclipse_projects.html).
-
-
-## Eclipse mailing lists
-
-The [Eclipse Mailing lists](eclipse_mls/eclipse_mls.html) dump is an extract of all emails posted on the Eclipse mailing lists.
-
-* Download the **Eclipse mailing lists dataset** [ [CSV](eclipse_mls/eclipse_mls.gz) ].
-* Check the **documentation** for the dataset [here (HTML)](eclipse_mls/mbox_csv_analysis.html). For reproducibility we also provide the [R Markdown document](eclipse_mls/mbox_csv_analysis.rmd) for the dataset analysis and documentation.
-* Download the **mbox files** [ [see the list](eclipse_mls/eclipse_mls.html#project-mboxes) ]
-
-More information can be found on the official [Eclipse page for mailing lists](https://accounts.eclipse.org/mailing-list).
-
-
-## AERI Stacktraces
-
-The [AERI stacktraces dataset](aeri_stacktraces/aeri_stacktraces.html) is a list of exceptions encountered by users in the Eclipse IDE, as retrieved by the AERI system. The Automated Error Reporting (AERI) system has been developed by the people at [Code Trails](https://www.codetrails.com/) and retrieves information about exceptions. It is installed by default in the Eclipse IDE and has helped hundreds of projects better support their users and resolve bugs. This dataset is a dump of all records over a couple of years, with useful information about the exceptions and environment.
-
-The last update of the dataset occurred on 2018-02-11.
-
-**Downloads**
-
-* **Problems full** [ [Download JSON](aeri_stacktraces/problems_full.tar.bz2) ] -- A list of all problems, exported as JSON (one problem per file).
-* **Problems extract** [ [Download CSV](aeri_stacktraces/problems_extract.csv.bz2) ] -- A list of all problems, exported as CSV (one big file).
-* **Incidents full** [ [Download JSON](aeri_stacktraces/incidents_full.tar.bz2) ] -- A list of all incidents, exported as JSON (one incident per file).
-* **Incidents extract** [ [Download CSV](aeri_stacktraces/incidents_extract.csv.bz2) ] -- A list of all incidents, exported as CSV (one big file).
-* **Incidents Bundles** [ [Download CSV](aeri_stacktraces/incidents_bundles_extract.csv.bz2) ] -- A list of all bundles found in incidents, exported as CSV. Attributes are bundle_name, bundle_version, and number of occurrences.
-
-**Documentation**
-
-* **Stacktraces Problems analysis document** [ [Download PDF](aeri_stacktraces/problems_analysis.pdf) | [Download Rmd](aeri_stacktraces/problems_analysis.rmd) ] -- An R Markdown document to analyse the Stacktraces problem dataset, with a description of the actual content and examples of usage.
-* **Stacktraces Incidents analysis document** [ [Download PDF](aeri_stacktraces/incidents_analysis.pdf) | [Download Rmd](aeri_stacktraces/incidents_analysis.rmd) ] -- An R Markdown document to analyse the Stacktraces incidents dataset, with a description of the actual content and examples of usage.
-
-More information about the AERI system can be found on the [Code Trails website](https://www.codetrails.com/error-analytics/manual/).
-
-## About Scava
-
-Scava is the Eclipse spin-off of Crossminer, an EU-funded research project. More information can be found at the following places:
-
-* The [Eclipse Scava project](https://eclipse.org/scava)
-* The official [documentation for Scava](https://scava-docs.readthedocs.io)
-* The [documentation repository](https://github.com/crossminer/scava-docs)
-* The official [Crossminer web page](https://crossminer.org)
-* The [GitHub Crossminer organisation](https://github.com/crossminer)
-
-## Licencing
-
-All datasets are published under the [Creative Commons BY-Attribution-Share Alike 4.0 (International)](https://creativecommons.org/licenses/by-sa/4.0/).
-
-All code is, unless otherwise stated, published under the [Eclipse Public Licence, v2](https://www.eclipse.org/legal/epl-2.0/).