Skip to content
Snippets Groups Projects
Commit ef8fa0f3 authored by Boris Baldassari's avatar Boris Baldassari
Browse files

#5 Refactor: delete datasets directory.


Signed-off-by: default avatarBoris Baldassari <boris@chrysalice.org>
parent 378a1003
No related branches found
No related tags found
No related merge requests found
Showing
with 0 additions and 6400 deletions
<!DOCTYPE html><html><head>
<title>aeri_stacktraces</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="file:////home/boris/.atom/packages/markdown-preview-enhanced/node_modules/@shd101wyy/mume/dependencies/katex/katex.min.css">
<style>
/**
* prism.js Github theme based on GitHub's theme.
* @author Sam Clarke
*/
code[class*="language-"],
pre[class*="language-"] {
color: #333;
background: none;
font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
text-align: left;
white-space: pre;
word-spacing: normal;
word-break: normal;
word-wrap: normal;
line-height: 1.4;
-moz-tab-size: 8;
-o-tab-size: 8;
tab-size: 8;
-webkit-hyphens: none;
-moz-hyphens: none;
-ms-hyphens: none;
hyphens: none;
}
/* Code blocks */
pre[class*="language-"] {
padding: .8em;
overflow: auto;
/* border: 1px solid #ddd; */
border-radius: 3px;
/* background: #fff; */
background: #f5f5f5;
}
/* Inline code */
:not(pre) > code[class*="language-"] {
padding: .1em;
border-radius: .3em;
white-space: normal;
background: #f5f5f5;
}
.token.comment,
.token.blockquote {
color: #969896;
}
.token.cdata {
color: #183691;
}
.token.doctype,
.token.punctuation,
.token.variable,
.token.macro.property {
color: #333;
}
.token.operator,
.token.important,
.token.keyword,
.token.rule,
.token.builtin {
color: #a71d5d;
}
.token.string,
.token.url,
.token.regex,
.token.attr-value {
color: #183691;
}
.token.property,
.token.number,
.token.boolean,
.token.entity,
.token.atrule,
.token.constant,
.token.symbol,
.token.command,
.token.code {
color: #0086b3;
}
.token.tag,
.token.selector,
.token.prolog {
color: #63a35c;
}
.token.function,
.token.namespace,
.token.pseudo-element,
.token.class,
.token.class-name,
.token.pseudo-class,
.token.id,
.token.url-reference .token.variable,
.token.attr-name {
color: #795da3;
}
.token.entity {
cursor: help;
}
.token.title,
.token.title .token.punctuation {
font-weight: bold;
color: #1d3e81;
}
.token.list {
color: #ed6a43;
}
.token.inserted {
background-color: #eaffea;
color: #55a532;
}
.token.deleted {
background-color: #ffecec;
color: #bd2c00;
}
.token.bold {
font-weight: bold;
}
.token.italic {
font-style: italic;
}
/* JSON */
.language-json .token.property {
color: #183691;
}
.language-markup .token.tag .token.punctuation {
color: #333;
}
/* CSS */
code.language-css,
.language-css .token.function {
color: #0086b3;
}
/* YAML */
.language-yaml .token.atrule {
color: #63a35c;
}
code.language-yaml {
color: #183691;
}
/* Ruby */
.language-ruby .token.function {
color: #333;
}
/* Markdown */
.language-markdown .token.url {
color: #795da3;
}
/* Makefile */
.language-makefile .token.symbol {
color: #795da3;
}
.language-makefile .token.variable {
color: #183691;
}
.language-makefile .token.builtin {
color: #0086b3;
}
/* Bash */
.language-bash .token.keyword {
color: #0086b3;
}
/* highlight */
pre[data-line] {
position: relative;
padding: 1em 0 1em 3em;
}
pre[data-line] .line-highlight-wrapper {
position: absolute;
top: 0;
left: 0;
background-color: transparent;
display: block;
width: 100%;
}
pre[data-line] .line-highlight {
position: absolute;
left: 0;
right: 0;
padding: inherit 0;
margin-top: 1em;
background: hsla(24, 20%, 50%,.08);
background: linear-gradient(to right, hsla(24, 20%, 50%,.1) 70%, hsla(24, 20%, 50%,0));
pointer-events: none;
line-height: inherit;
white-space: pre;
}
pre[data-line] .line-highlight:before,
pre[data-line] .line-highlight[data-end]:after {
content: attr(data-start);
position: absolute;
top: .4em;
left: .6em;
min-width: 1em;
padding: 0 .5em;
background-color: hsla(24, 20%, 50%,.4);
color: hsl(24, 20%, 95%);
font: bold 65%/1.5 sans-serif;
text-align: center;
vertical-align: .3em;
border-radius: 999px;
text-shadow: none;
box-shadow: 0 1px white;
}
pre[data-line] .line-highlight[data-end]:after {
content: attr(data-end);
top: auto;
bottom: .4em;
}html body{font-family:"Helvetica Neue",Helvetica,"Segoe UI",Arial,freesans,sans-serif;font-size:16px;line-height:1.6;color:#333;background-color:#fff;overflow:initial;box-sizing:border-box;word-wrap:break-word}html body>:first-child{margin-top:0}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{line-height:1.2;margin-top:1em;margin-bottom:16px;color:#000}html body h1{font-size:2.25em;font-weight:300;padding-bottom:.3em}html body h2{font-size:1.75em;font-weight:400;padding-bottom:.3em}html body h3{font-size:1.5em;font-weight:500}html body h4{font-size:1.25em;font-weight:600}html body h5{font-size:1.1em;font-weight:600}html body h6{font-size:1em;font-weight:600}html body h1,html body h2,html body h3,html body h4,html body h5{font-weight:600}html body h5{font-size:1em}html body h6{color:#5c5c5c}html body strong{color:#000}html body del{color:#5c5c5c}html body a:not([href]){color:inherit;text-decoration:none}html body a{color:#08c;text-decoration:none}html body a:hover{color:#00a3f5;text-decoration:none}html body img{max-width:100%}html body>p{margin-top:0;margin-bottom:16px;word-wrap:break-word}html body>ul,html body>ol{margin-bottom:16px}html body ul,html body ol{padding-left:2em}html body ul.no-list,html body ol.no-list{padding:0;list-style-type:none}html body ul ul,html body ul ol,html body ol ol,html body ol ul{margin-top:0;margin-bottom:0}html body li{margin-bottom:0}html body li.task-list-item{list-style:none}html body li>p{margin-top:0;margin-bottom:0}html body .task-list-item-checkbox{margin:0 .2em .25em -1.8em;vertical-align:middle}html body .task-list-item-checkbox:hover{cursor:pointer}html body blockquote{margin:16px 0;font-size:inherit;padding:0 15px;color:#5c5c5c;border-left:4px solid #d6d6d6}html body blockquote>:first-child{margin-top:0}html body blockquote>:last-child{margin-bottom:0}html body hr{height:4px;margin:32px 0;background-color:#d6d6d6;border:0 none}html body table{margin:10px 0 15px 
0;border-collapse:collapse;border-spacing:0;display:block;width:100%;overflow:auto;word-break:normal;word-break:keep-all}html body table th{font-weight:bold;color:#000}html body table td,html body table th{border:1px solid #d6d6d6;padding:6px 13px}html body dl{padding:0}html body dl dt{padding:0;margin-top:16px;font-size:1em;font-style:italic;font-weight:bold}html body dl dd{padding:0 16px;margin-bottom:16px}html body code{font-family:Menlo,Monaco,Consolas,'Courier New',monospace;font-size:.85em !important;color:#000;background-color:#f0f0f0;border-radius:3px;padding:.2em 0}html body code::before,html body code::after{letter-spacing:-0.2em;content:"\00a0"}html body pre>code{padding:0;margin:0;font-size:.85em !important;word-break:normal;white-space:pre;background:transparent;border:0}html body .highlight{margin-bottom:16px}html body .highlight pre,html body pre{padding:1em;overflow:auto;font-size:.85em !important;line-height:1.45;border:#d6d6d6;border-radius:3px}html body .highlight pre{margin-bottom:0;word-break:normal}html body pre code,html body pre tt{display:inline;max-width:initial;padding:0;margin:0;overflow:initial;line-height:inherit;word-wrap:normal;background-color:transparent;border:0}html body pre code:before,html body pre tt:before,html body pre code:after,html body pre tt:after{content:normal}html body p,html body blockquote,html body ul,html body ol,html body dl,html body pre{margin-top:0;margin-bottom:16px}html body kbd{color:#000;border:1px solid #d6d6d6;border-bottom:2px solid #c7c7c7;padding:2px 4px;background-color:#f0f0f0;border-radius:3px}@media print{html body{background-color:#fff}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{color:#000;page-break-after:avoid}html body blockquote{color:#5c5c5c}html body pre{page-break-inside:avoid}html body table{display:table}html body img{display:block;max-width:100%;max-height:100%}html body pre,html body 
code{word-wrap:break-word;white-space:pre}}.markdown-preview{width:100%;height:100%;box-sizing:border-box}.markdown-preview .pagebreak,.markdown-preview .newpage{page-break-before:always}.markdown-preview pre.line-numbers{position:relative;padding-left:3.8em;counter-reset:linenumber}.markdown-preview pre.line-numbers>code{position:relative}.markdown-preview pre.line-numbers .line-numbers-rows{position:absolute;pointer-events:none;top:1em;font-size:100%;left:0;width:3em;letter-spacing:-1px;border-right:1px solid #999;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.markdown-preview pre.line-numbers .line-numbers-rows>span{pointer-events:none;display:block;counter-increment:linenumber}.markdown-preview pre.line-numbers .line-numbers-rows>span:before{content:counter(linenumber);color:#999;display:block;padding-right:.8em;text-align:right}.markdown-preview .mathjax-exps .MathJax_Display{text-align:center !important}.markdown-preview:not([for="preview"]) .code-chunk .btn-group{display:none}.markdown-preview:not([for="preview"]) .code-chunk .status{display:none}.markdown-preview:not([for="preview"]) .code-chunk .output-div{margin-bottom:16px}.scrollbar-style::-webkit-scrollbar{width:8px}.scrollbar-style::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}.scrollbar-style::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode]){position:relative;width:100%;height:100%;top:0;left:0;margin:0;padding:0;overflow:auto}html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{position:relative;top:0}@media screen and (min-width:914px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{padding:2em calc(50% - 457px)}}@media screen and (max-width:914px){html body[for="html-export"]:not([data-presentation-mode]) 
.markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{font-size:14px !important;padding:1em}}@media print{html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{display:none}}html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{position:fixed;bottom:8px;left:8px;font-size:28px;cursor:pointer;color:inherit;z-index:99;width:32px;text-align:center;opacity:.4}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] #sidebar-toc-btn{opacity:1}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc{position:fixed;top:0;left:0;width:300px;height:100%;padding:32px 0 48px 0;font-size:14px;box-shadow:0 0 4px rgba(150,150,150,0.33);box-sizing:border-box;overflow:auto;background-color:inherit}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar{width:8px}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc a{text-decoration:none}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{padding:0 1.6em;margin-top:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc li{margin-bottom:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{list-style-type:none}html 
body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{left:300px;width:calc(100% - 300px);padding:2em calc(50% - 457px - 150px);margin:0;box-sizing:border-box}@media screen and (max-width:1274px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{width:100%}}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .markdown-preview{left:50%;transform:translateX(-50%)}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .md-sidebar-toc{display:none}
/* Please visit the URL below for more information: */
/* https://shd101wyy.github.io/markdown-preview-enhanced/#/customize-css */
</style>
</head>
<body for="html-export">
<div class="mume markdown-preview ">
<h1 class="mume-header" id="the-aeri-stacktraces-dataset">The AERI Stacktraces dataset</h1>
<h2 class="mume-header" id="presentation">Presentation</h2>
<p>The <a href="https://wiki.eclipse.org/EPP/Logging">Automated Error Reporting</a> (AERI) system retrieves <a href="https://www.codetrails.com/error-analytics/manual/">information about exceptions</a>. It is installed by default in the <a href="http://www.eclipse.org/ide/">Eclipse IDE</a> and has helped hundreds of projects better support their users and resolve bugs.</p>
<p>This dataset is a dump of all records over a couple of years, with useful information about the exceptions and environment. It is composed of:</p>
<ul>
<li><strong>Incidents</strong> When an exception occurs and is trapped by the AERI system, it constitutes an incident (or error report). An incident can be reported by several different people, can be reported multiple times, and can be linked to different environments.</li>
<li><strong>Problems</strong> As soon as an error report arrives on the server, it will be analyzed and subsequently assigned to one or more problems. A problem thus represents a set of (similar) error reports which usually have the same root cause &#x2013; for example a bug in your software. (Extract from the <a href="https://www.codetrails.com/error-analytics/manual/concepts/error-reports-problems-bugs-projects.html">AERI system documentation</a>)</li>
</ul>
<p>This dataset is published under the <a href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) licence</a>.</p>
<h2 class="mume-header" id="downloads">Downloads</h2>
<ul>
<li><strong>Problems full</strong> [ <a href="problems_full.tar.bz2">Download JSON</a> ] -- A list of all problems, exported as JSON (one problem per file).
<ul>
<li>Content: 125250 entries, 22 attributes</li>
<li>Size: 38M compressed, 904M raw</li>
</ul>
</li>
<li><strong>Problems extract</strong> [ <a href="problems_extract.csv.bz2">Download CSV</a> ] -- A list of all problems, exported as CSV (one big file).
<ul>
<li>Content: 125250 entries, 22 attributes</li>
<li>Size: 1.5M compressed, 14M raw</li>
</ul>
</li>
<li><strong>Incidents full</strong> [ <a href="incidents_full.tar.bz2">Download JSON</a> ] -- A list of all incidents, exported as JSON (one incident per file).
<ul>
<li>Content: 2084363 entries, 22 attributes</li>
<li>Size: 820M compressed, 19G raw</li>
</ul>
</li>
<li><strong>Incidents extract</strong> [ <a href="incidents_extract.csv.bz2">Download CSV</a> ] -- A list of all incidents, exported as CSV (one big file).
<ul>
<li>Content: 2084045 entries, 20 attributes</li>
<li>Size: 141M compressed, 778M raw</li>
</ul>
</li>
<li><strong>Incidents Bundles</strong> [ <a href="incidents_bundles_extract.csv.bz2">Download CSV</a> ] -- A list of all bundles found in incidents, exported as CSV. Attributes are bundle_name, bundle_version, and number of occurrences.
<ul>
<li>Content: 29709 entries, 3 attributes</li>
<li>Size: 220K compressed, 1.5M raw</li>
</ul>
</li>
</ul>
<p><strong>Documentation</strong></p>
<ul>
<li><strong>Stacktraces Problems analysis document</strong> [ <a href="problems_analysis.pdf">Download PDF</a> | <a href="problems_analysis.rmd">Download Rmd</a> ] -- An R Markdown document to analyse the Stacktraces problem dataset, with a description of the actual content and examples of usage.</li>
<li><strong>Stacktraces Incidents analysis document</strong> [ <a href="incidents_analysis.pdf">Download PDF</a> | <a href="incidents_analysis.rmd">Download Rmd</a> ] -- An R Markdown document to analyse the Stacktraces incidents dataset, with a description of the actual content and examples of usage.</li>
</ul>
<p>More information about the AERI system can be found on the <a href="https://www.codetrails.com/error-analytics/manual/">Code Trails website</a>.</p>
<h2 class="mume-header" id="privacy-concerns">Privacy concerns</h2>
<p>The result contains no email address, user id or machine id. Rather than trying to strip sensitive fields from the records (we could not be sure of removing everything that needed to be removed), we decided to simply pick the relevant information from the file and push it into the output.</p>
<p>End users have an option to keep their own class names private. We presently have no simple means of knowing which stacktraces in the database extraction should be kept private, so we decided to play it safe and hide class names whose packages don&apos;t start with known prefixes [1]. All private class names have been replaced by the HIDDEN keyword.</p>
<p>[1] <code>&quot;ch.qos.*&quot;, &quot;com.cforcoding.*&quot;, &quot;com.google.*&quot;, &quot;com.gradleware.tooling.*&quot;, &quot;com.mountainminds.eclemma.*&quot;, &quot;com.naef.*&quot;, &quot;com.sun.*&quot;, &quot;java.*&quot;, &quot;javafx.*&quot;, &quot;javax.*&quot;, &quot;org.apache.*&quot;, &quot;org.eclipse.*&quot;, &quot;org.fordiac.*&quot;, &quot;org.gradle.*&quot;, &quot;org.jacoco.*&quot;, &quot;org.osgi.*&quot;, &quot;org.slf4j.*&quot;, &quot;sun.*&quot;</code></p>
<h2 class="mume-header" id="format-problems">Format: problems</h2>
<pre class="language-text">{
&quot;summary&quot;: &quot;&quot;,
&quot;osgiArch&quot;: &quot;&quot;,
&quot;osgiOs&quot;: &quot;&quot;,
&quot;osgiOsVersion&quot;: &quot;&quot;,
&quot;osgiWs&quot;: &quot;&quot;,
&quot;eclipseBuildId&quot;: &quot;&quot;,
&quot;eclipseProduct&quot;: &quot;&quot;,
&quot;javaRuntimeVersion&quot;: &quot;&quot;,
&quot;numberOfIncidents&quot;: 0,
&quot;numberOfReporters&quot;: 74,
&quot;stacktraces&quot;: [
[ &quot;stacktrace for incident&quot; ],
[ &quot;stacktrace for cause&quot; ],
[ &quot;stacktrace for exception&quot; ]
]
}
</pre>
<h2 class="mume-header" id="format-incidents">Format: incidents</h2>
<pre class="language-text">{
&quot;eclipseBuildId&quot;:&quot;4.6.1.M20160907-1200&quot;,
&quot;eclipseProduct&quot;:&quot;org.eclipse.epp.package.jee.product&quot;,
&quot;javaRuntimeVersion&quot;:&quot;1.8.0_112-b15&quot;,
&quot;osgiArch&quot;:&quot;x86_64&quot;,
&quot;osgiOs&quot;:&quot;Windows7&quot;,
&quot;osgiOsVersion&quot;:&quot;6.1.0&quot;,
&quot;osgiWs&quot;:&quot;win32&quot;,
&quot;stacktraces&quot;:[
[ &quot;stacktrace&quot; ]
],
&quot;summary&quot;: &quot;Failed to retrieve default libraries for jre1.8.0_111&quot;
}
</pre>
<h2 class="mume-header" id="format-stacktraces">Format: Stacktraces</h2>
<p>The structure used in the MongoDB database for stacktraces has been kept as is: it is composed of fields holding all the information relevant to each line of the stacktrace. Each stacktrace is an array of objects, as shown below:</p>
<pre class="language-text">[
{
&quot;cN&quot;: &quot;sun.net.www.http.HttpClient&quot;,
&quot;mN&quot;: &quot;parseHTTPHeader&quot;,
&quot;fN&quot;: &quot;HttpClient.java&quot;,
&quot;lN&quot;: 786
}
]
</pre>
</div>
</body></html>
\ No newline at end of file
# The AERI Stacktraces dataset
## Presentation
The [Automated Error Reporting](https://wiki.eclipse.org/EPP/Logging) (AERI) system retrieves [information about exceptions](https://www.codetrails.com/error-analytics/manual/). It is installed by default in the [Eclipse IDE](http://www.eclipse.org/ide/) and has helped hundreds of projects better support their users and resolve bugs.
This dataset is a dump of all records over a couple of years, with useful information about the exceptions and environment. It is composed of:
* **Incidents** When an exception occurs and is trapped by the AERI system, it constitutes an incident (or error report). An incident can be reported by several different people, can be reported multiple times, and can be linked to different environments.
* **Problems** As soon as an error report arrives on the server, it will be analyzed and subsequently assigned to one or more problems. A problem thus represents a set of (similar) error reports which usually have the same root cause – for example a bug in your software. (Extract from the [AERI system documentation](https://www.codetrails.com/error-analytics/manual/concepts/error-reports-problems-bugs-projects.html))
This dataset is published under the [Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) licence](https://creativecommons.org/licenses/by-sa/4.0/).
## Downloads
* **Problems full** [ [Download JSON](problems_full.tar.bz2) ] -- A list of all problems, exported as JSON (one problem per file).
* Content: 125250 entries, 22 attributes
* Size: 38M compressed, 904M raw
* **Problems extract** [ [Download CSV](problems_extract.csv.bz2) ] -- A list of all problems, exported as CSV (one big file).
* Content: 125250 entries, 22 attributes
* Size: 1.5M compressed, 14M raw
* **Incidents full** [ [Download JSON](incidents_full.tar.bz2) ] -- A list of all incidents, exported as JSON (one incident per file).
* Content: 2084363 entries, 22 attributes
* Size: 820M compressed, 19G raw
* **Incidents extract** [ [Download CSV](incidents_extract.csv.bz2) ] -- A list of all incidents, exported as CSV (one big file).
* Content: 2084045 entries, 20 attributes
* Size: 141M compressed, 778M raw
* **Incidents Bundles** [ [Download CSV](incidents_bundles_extract.csv.bz2) ] -- A list of all bundles found in incidents, exported as CSV. Attributes are bundle_name, bundle_version, and number of occurrences.
* Content: 29709 entries, 3 attributes
* Size: 220K compressed, 1.5M raw
**Documentation**
* **Stacktraces Problems analysis document** [ [Download PDF](problems_analysis.pdf) | [Download Rmd](problems_analysis.rmd) ] -- An R Markdown document to analyse the Stacktraces problem dataset, with a description of the actual content and examples of usage.
* **Stacktraces Incidents analysis document** [ [Download PDF](incidents_analysis.pdf) | [Download Rmd](incidents_analysis.rmd) ] -- An R Markdown document to analyse the Stacktraces incidents dataset, with a description of the actual content and examples of usage.
More information about the AERI system can be found on the [Code Trails website](https://www.codetrails.com/error-analytics/manual/).
## Privacy concerns
The result contains no email address, user id or machine id. Rather than trying to strip sensitive fields from the records (we could not be sure of removing everything that needed to be removed), we decided to simply pick the relevant information from the file and push it into the output.
End users have an option to keep their own class names private. We presently have no simple means of knowing which stacktraces in the database extraction should be kept private, so we decided to play it safe and hide class names whose packages don't start with known prefixes [1]. All private class names have been replaced by the HIDDEN keyword.
[1] `"ch.qos.*", "com.cforcoding.*", "com.google.*", "com.gradleware.tooling.*", "com.mountainminds.eclemma.*", "com.naef.*", "com.sun.*", "java.*", "javafx.*", "javax.*", "org.apache.*", "org.eclipse.*", "org.fordiac.*", "org.gradle.*", "org.jacoco.*", "org.osgi.*", "org.slf4j.*", "sun.*" `
## Format: problems
{
"summary": "",
"osgiArch": "",
"osgiOs": "",
"osgiOsVersion": "",
"osgiWs": "",
"eclipseBuildId": "",
"eclipseProduct": "",
"javaRuntimeVersion": "",
"numberOfIncidents": 0,
"numberOfReporters": 74,
"stacktraces": [
[ "stacktrace for incident" ],
[ "stacktrace for cause" ],
[ "stacktrace for exception" ]
]
}
## Format: incidents
{
"eclipseBuildId":"4.6.1.M20160907-1200",
"eclipseProduct":"org.eclipse.epp.package.jee.product",
"javaRuntimeVersion":"1.8.0_112-b15",
"osgiArch":"x86_64",
"osgiOs":"Windows7",
"osgiOsVersion":"6.1.0",
"osgiWs":"win32",
"stacktraces":[
[ "stacktrace" ]
],
"summary": "Failed to retrieve default libraries for jre1.8.0_111"
}
## Format: Stacktraces
The structure used in the MongoDB database for stacktraces has been kept as is: it is composed of fields holding all the information relevant to each line of the stacktrace. Each stacktrace is an array of objects, as shown below:
[
{
"cN": "sun.net.www.http.HttpClient",
"mN": "parseHTTPHeader",
"fN": "HttpClient.java",
"lN": 786
}
]
File deleted
This diff is collapsed.
This diff is collapsed.
File deleted
This diff is collapsed.
<!DOCTYPE html><html><head>
<title>scava_aeri_readme</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="file:////home/boris/.atom/packages/markdown-preview-enhanced/node_modules/@shd101wyy/mume/dependencies/katex/katex.min.css">
<style>
/**
* prism.js Github theme based on GitHub's theme.
* @author Sam Clarke
*/
code[class*="language-"],
pre[class*="language-"] {
color: #333;
background: none;
font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
text-align: left;
white-space: pre;
word-spacing: normal;
word-break: normal;
word-wrap: normal;
line-height: 1.4;
-moz-tab-size: 8;
-o-tab-size: 8;
tab-size: 8;
-webkit-hyphens: none;
-moz-hyphens: none;
-ms-hyphens: none;
hyphens: none;
}
/* Code blocks */
pre[class*="language-"] {
padding: .8em;
overflow: auto;
/* border: 1px solid #ddd; */
border-radius: 3px;
/* background: #fff; */
background: #f5f5f5;
}
/* Inline code */
:not(pre) > code[class*="language-"] {
padding: .1em;
border-radius: .3em;
white-space: normal;
background: #f5f5f5;
}
.token.comment,
.token.blockquote {
color: #969896;
}
.token.cdata {
color: #183691;
}
.token.doctype,
.token.punctuation,
.token.variable,
.token.macro.property {
color: #333;
}
.token.operator,
.token.important,
.token.keyword,
.token.rule,
.token.builtin {
color: #a71d5d;
}
.token.string,
.token.url,
.token.regex,
.token.attr-value {
color: #183691;
}
.token.property,
.token.number,
.token.boolean,
.token.entity,
.token.atrule,
.token.constant,
.token.symbol,
.token.command,
.token.code {
color: #0086b3;
}
.token.tag,
.token.selector,
.token.prolog {
color: #63a35c;
}
.token.function,
.token.namespace,
.token.pseudo-element,
.token.class,
.token.class-name,
.token.pseudo-class,
.token.id,
.token.url-reference .token.variable,
.token.attr-name {
color: #795da3;
}
.token.entity {
cursor: help;
}
.token.title,
.token.title .token.punctuation {
font-weight: bold;
color: #1d3e81;
}
.token.list {
color: #ed6a43;
}
.token.inserted {
background-color: #eaffea;
color: #55a532;
}
.token.deleted {
background-color: #ffecec;
color: #bd2c00;
}
.token.bold {
font-weight: bold;
}
.token.italic {
font-style: italic;
}
/* JSON */
.language-json .token.property {
color: #183691;
}
.language-markup .token.tag .token.punctuation {
color: #333;
}
/* CSS */
code.language-css,
.language-css .token.function {
color: #0086b3;
}
/* YAML */
.language-yaml .token.atrule {
color: #63a35c;
}
code.language-yaml {
color: #183691;
}
/* Ruby */
.language-ruby .token.function {
color: #333;
}
/* Markdown */
.language-markdown .token.url {
color: #795da3;
}
/* Makefile */
.language-makefile .token.symbol {
color: #795da3;
}
.language-makefile .token.variable {
color: #183691;
}
.language-makefile .token.builtin {
color: #0086b3;
}
/* Bash */
.language-bash .token.keyword {
color: #0086b3;
}
/* highlight */
pre[data-line] {
position: relative;
padding: 1em 0 1em 3em;
}
pre[data-line] .line-highlight-wrapper {
position: absolute;
top: 0;
left: 0;
background-color: transparent;
display: block;
width: 100%;
}
pre[data-line] .line-highlight {
position: absolute;
left: 0;
right: 0;
padding: inherit 0;
margin-top: 1em;
background: hsla(24, 20%, 50%,.08);
background: linear-gradient(to right, hsla(24, 20%, 50%,.1) 70%, hsla(24, 20%, 50%,0));
pointer-events: none;
line-height: inherit;
white-space: pre;
}
pre[data-line] .line-highlight:before,
pre[data-line] .line-highlight[data-end]:after {
content: attr(data-start);
position: absolute;
top: .4em;
left: .6em;
min-width: 1em;
padding: 0 .5em;
background-color: hsla(24, 20%, 50%,.4);
color: hsl(24, 20%, 95%);
font: bold 65%/1.5 sans-serif;
text-align: center;
vertical-align: .3em;
border-radius: 999px;
text-shadow: none;
box-shadow: 0 1px white;
}
pre[data-line] .line-highlight[data-end]:after {
content: attr(data-end);
top: auto;
bottom: .4em;
}html body{font-family:"Helvetica Neue",Helvetica,"Segoe UI",Arial,freesans,sans-serif;font-size:16px;line-height:1.6;color:#333;background-color:#fff;overflow:initial;box-sizing:border-box;word-wrap:break-word}html body>:first-child{margin-top:0}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{line-height:1.2;margin-top:1em;margin-bottom:16px;color:#000}html body h1{font-size:2.25em;font-weight:300;padding-bottom:.3em}html body h2{font-size:1.75em;font-weight:400;padding-bottom:.3em}html body h3{font-size:1.5em;font-weight:500}html body h4{font-size:1.25em;font-weight:600}html body h5{font-size:1.1em;font-weight:600}html body h6{font-size:1em;font-weight:600}html body h1,html body h2,html body h3,html body h4,html body h5{font-weight:600}html body h5{font-size:1em}html body h6{color:#5c5c5c}html body strong{color:#000}html body del{color:#5c5c5c}html body a:not([href]){color:inherit;text-decoration:none}html body a{color:#08c;text-decoration:none}html body a:hover{color:#00a3f5;text-decoration:none}html body img{max-width:100%}html body>p{margin-top:0;margin-bottom:16px;word-wrap:break-word}html body>ul,html body>ol{margin-bottom:16px}html body ul,html body ol{padding-left:2em}html body ul.no-list,html body ol.no-list{padding:0;list-style-type:none}html body ul ul,html body ul ol,html body ol ol,html body ol ul{margin-top:0;margin-bottom:0}html body li{margin-bottom:0}html body li.task-list-item{list-style:none}html body li>p{margin-top:0;margin-bottom:0}html body .task-list-item-checkbox{margin:0 .2em .25em -1.8em;vertical-align:middle}html body .task-list-item-checkbox:hover{cursor:pointer}html body blockquote{margin:16px 0;font-size:inherit;padding:0 15px;color:#5c5c5c;border-left:4px solid #d6d6d6}html body blockquote>:first-child{margin-top:0}html body blockquote>:last-child{margin-bottom:0}html body hr{height:4px;margin:32px 0;background-color:#d6d6d6;border:0 none}html body table{margin:10px 0 15px 
0;border-collapse:collapse;border-spacing:0;display:block;width:100%;overflow:auto;word-break:normal;word-break:keep-all}html body table th{font-weight:bold;color:#000}html body table td,html body table th{border:1px solid #d6d6d6;padding:6px 13px}html body dl{padding:0}html body dl dt{padding:0;margin-top:16px;font-size:1em;font-style:italic;font-weight:bold}html body dl dd{padding:0 16px;margin-bottom:16px}html body code{font-family:Menlo,Monaco,Consolas,'Courier New',monospace;font-size:.85em !important;color:#000;background-color:#f0f0f0;border-radius:3px;padding:.2em 0}html body code::before,html body code::after{letter-spacing:-0.2em;content:"\00a0"}html body pre>code{padding:0;margin:0;font-size:.85em !important;word-break:normal;white-space:pre;background:transparent;border:0}html body .highlight{margin-bottom:16px}html body .highlight pre,html body pre{padding:1em;overflow:auto;font-size:.85em !important;line-height:1.45;border:#d6d6d6;border-radius:3px}html body .highlight pre{margin-bottom:0;word-break:normal}html body pre code,html body pre tt{display:inline;max-width:initial;padding:0;margin:0;overflow:initial;line-height:inherit;word-wrap:normal;background-color:transparent;border:0}html body pre code:before,html body pre tt:before,html body pre code:after,html body pre tt:after{content:normal}html body p,html body blockquote,html body ul,html body ol,html body dl,html body pre{margin-top:0;margin-bottom:16px}html body kbd{color:#000;border:1px solid #d6d6d6;border-bottom:2px solid #c7c7c7;padding:2px 4px;background-color:#f0f0f0;border-radius:3px}@media print{html body{background-color:#fff}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{color:#000;page-break-after:avoid}html body blockquote{color:#5c5c5c}html body pre{page-break-inside:avoid}html body table{display:table}html body img{display:block;max-width:100%;max-height:100%}html body pre,html body 
code{word-wrap:break-word;white-space:pre}}.markdown-preview{width:100%;height:100%;box-sizing:border-box}.markdown-preview .pagebreak,.markdown-preview .newpage{page-break-before:always}.markdown-preview pre.line-numbers{position:relative;padding-left:3.8em;counter-reset:linenumber}.markdown-preview pre.line-numbers>code{position:relative}.markdown-preview pre.line-numbers .line-numbers-rows{position:absolute;pointer-events:none;top:1em;font-size:100%;left:0;width:3em;letter-spacing:-1px;border-right:1px solid #999;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.markdown-preview pre.line-numbers .line-numbers-rows>span{pointer-events:none;display:block;counter-increment:linenumber}.markdown-preview pre.line-numbers .line-numbers-rows>span:before{content:counter(linenumber);color:#999;display:block;padding-right:.8em;text-align:right}.markdown-preview .mathjax-exps .MathJax_Display{text-align:center !important}.markdown-preview:not([for="preview"]) .code-chunk .btn-group{display:none}.markdown-preview:not([for="preview"]) .code-chunk .status{display:none}.markdown-preview:not([for="preview"]) .code-chunk .output-div{margin-bottom:16px}.scrollbar-style::-webkit-scrollbar{width:8px}.scrollbar-style::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}.scrollbar-style::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode]){position:relative;width:100%;height:100%;top:0;left:0;margin:0;padding:0;overflow:auto}html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{position:relative;top:0}@media screen and (min-width:914px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{padding:2em calc(50% - 457px)}}@media screen and (max-width:914px){html body[for="html-export"]:not([data-presentation-mode]) 
.markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{font-size:14px !important;padding:1em}}@media print{html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{display:none}}html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{position:fixed;bottom:8px;left:8px;font-size:28px;cursor:pointer;color:inherit;z-index:99;width:32px;text-align:center;opacity:.4}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] #sidebar-toc-btn{opacity:1}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc{position:fixed;top:0;left:0;width:300px;height:100%;padding:32px 0 48px 0;font-size:14px;box-shadow:0 0 4px rgba(150,150,150,0.33);box-sizing:border-box;overflow:auto;background-color:inherit}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar{width:8px}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc a{text-decoration:none}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{padding:0 1.6em;margin-top:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc li{margin-bottom:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{list-style-type:none}html 
body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{left:300px;width:calc(100% - 300px);padding:2em calc(50% - 457px - 150px);margin:0;box-sizing:border-box}@media screen and (max-width:1274px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{width:100%}}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .markdown-preview{left:50%;transform:translateX(-50%)}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .md-sidebar-toc{display:none}
/* Please visit the URL below for more information: */
/* https://shd101wyy.github.io/markdown-preview-enhanced/#/customize-css */
</style>
</head>
<body for="html-export">
<div class="mume markdown-preview ">
<h1 class="mume-header" id="release-notes-for-the-aeri-dataset">Release notes for the AERI dataset</h1>
<p>The database dump contains information about errors encountered by people when using Eclipse. It is composed of several mongodb tables and uses the bson format. Only two tables contain stack traces: <code>problems</code> and <code>incidents</code>.</p>
<p>The bson files can be read using the bsondump utility, provided with the mongodb client package (mongodb-clients on Debian).</p>
<pre data-role="codeBlock" data-info class="language-"><code>bsondump problems.bson --type json &gt; problems.json
</code></pre><p>After conversion the two files are quite big: 37GB for incidents and 2.1 GB for problems.</p>
<p>Unfortunately the utility adds some progress information in the UI that needs to be removed from the output:</p>
<pre data-role="codeBlock" data-info class="language-"><code>grep -v &apos;Progress: &apos; problems.json &gt; problems_clean.json
</code></pre><p>We also had to remove a few (approx. a dozen) lines because they embed unparseable source code, characters or Asian/binary/utf8/16/256 text. The script tries to JSON-decode all lines one by one, and on failure simply goes to the next line.</p>
<p>For <code>problems</code> (the file is reasonably small) the script generates for each line a separate JSON file with only information related to that line. The script for problems extraction is <code>parse_json_problems.pl</code>. Output is 820MB and processing time is roughly 45mn.</p>
<p>For <code>incidents</code> (file is 37GB) the script generates for each line a separate JSON file with only information related to that line. For the record, trying to generate a single file requires at least twice the size of the file in RAM/SWAP (i.e. roughly 74GB). There are 2084328 files in the output, for a total of 17GB. The script for incidents extraction is <code>parse_json_incidents.pl</code>. To get an idea of the resources required to process that, the final incidents extraction took roughly 16h on a quite powerful box.</p>
</div>
</body></html>
\ No newline at end of file
# Release notes for the AERI dataset
The database dump contains information about errors encountered by people when using Eclipse. It is composed of several mongodb tables and uses the bson format. Only two tables contain stack traces: `problems` and `incidents`.
The bson files can be read using the bsondump utility, provided with the mongodb client package (mongodb-clients on Debian).
```
bsondump problems.bson --type json > problems.json
```
After conversion the two files are quite big: 37GB for incidents and 2.1 GB for problems.
Unfortunately the utility adds some progress information in the UI that needs to be removed from the output:
```
grep -v 'Progress: ' problems.json > problems_clean.json
```
We also had to remove a few (approx. a dozen) lines because they embed unparseable source code, characters or Asian/binary/utf8/16/256 text. The script tries to JSON-decode all lines one by one, and on failure simply goes to the next line.
For `problems` (the file is reasonably small) the script generates for each line a separate JSON file with only information related to that line. The script for problems extraction is `parse_json_problems.pl`. Output is 820MB and processing time is roughly 45mn.
For `incidents` (file is 37GB) the script generates for each line a separate JSON file with only information related to that line. For the record, trying to generate a single file requires at least twice the size of the file in RAM/SWAP (i.e. roughly 74GB). There are 2084328 files in the output, for a total of 17GB. The script for incidents extraction is `parse_json_incidents.pl`. To get an idea of the resources required to process that, the final incidents extraction took roughly 16h on a quite powerful box.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
---
title: "Eclipse mailing lists"
subtitle: "R Analysis document"
author: "Boris Baldassari -- Castalia Solutions"
# Each output format carries its own table-of-contents options; the options
# must be nested under the format name to be attached to it.
output:
  html_document:
    toc: yes
    toc_depth: 2
  pdf_document:
    toc: yes
    toc_depth: 3
  word_document:
    toc: yes
    toc_depth: '2'
# Document parameters: `project` is the CSV file read by the init chunk.
params:
  project: eclipse_mls_full.csv
---
```{r init, message=FALSE, echo=FALSE}
# Packages used throughout the document. library() stops immediately when a
# package is missing, whereas require() only returns FALSE and lets a later
# call fail with a less helpful error.
library(ggplot2)
library(plotly)
library(ggthemes)
library(knitr)
library(kableExtra)
library(parsedate)
library(magrittr)
library(xts)

# Read csv file
file.in <- as.character(params$project)
project.csv <- read.csv(file.in, header = TRUE)

# Remember the column names as they are in the file, before the derived
# Company column is added; they are listed in the "Structure of data" section.
names.orig <- names(project.csv)

# Characters 18 to 33 of the scrambled sender address are used as the company.
project.csv$Company <- substr(x = project.csv$sender_addr, 18, 33)

# Create xts object, indexed by the sending time of each post
project.xts <- xts(x = project.csv, order.by = parse_iso_8601(project.csv$sent_at))

# Initialise plotly
# SECURITY: the plotly API key must not be stored in this document. It is only
# needed by the (currently commented-out) api_create() calls; provide
# plotly_username / plotly_api_key through the environment (e.g. ~/.Renviron)
# before knitting if those calls are re-enabled.
# NOTE(review): the key that used to be committed here should be treated as
# compromised and revoked.
if (!nzchar(Sys.getenv("plotly_username"))) {
  Sys.setenv("plotly_username" = "BorisBaldassari")
}
```
# Introduction
## About this dataset
The [Eclipse Foundation](https://eclipse.org) provides individuals and organizations with a commercially focused environment for open source software innovation. It includes git repositories, reviews, issues management, continuous integration, forums and mailing lists among other services. Many well-known and widely used projects are hosted on the forge, including the Eclipse IDE itself and the new Java working group.
This dataset is a dump of all posts sent on all mailing lists hosted at the Eclipse Forge. Although this is public data (the mailing lists can be browsed on the [official mailman page](https://accounts.eclipse.org/mailing-list)) all data has been anonymised to prevent any misuse. The privacy issues identified, along with the anonymisation process, have been covered in a [dedicated document](../../docs/datasets_privacy.html).
These files are published under the [Creative Commons BY-Attribution-Share Alike 4.0 (International) licence](https://creativecommons.org/licenses/by-sa/4.0/).
The dataset is composed of two parts:
* **eclipse_mls_full.csv** contains an extract of all the messages exchanged on the various mailing lists. The present document uses this CSV as input data.
* The **full list of mboxes**, one file per mailing list. They can be downloaded directly from the [mboxes subdirectory](mboxes/).
All of them are updated weekly at 2am on Sunday.
## Basic summary
* **Generated date**: `r date()`
* **First date**: `r first(index(project.xts))`
* **Last date**: `r last(index(project.xts))`
* **Number of posts**: `r nrow(project.xts)`
* **Number of attributes**: `r ncol(project.xts)`
## Privacy concerns
We value privacy and intend to do everything we can to prevent misuse of the dataset. If you think we failed somewhere in the process, please [let us know](https://www.crossminer.org/contact) so we can do better.
All personally identifiable information has been scrambled using the [data anonymiser](https://github.com/borisbaldassari/data-anonymiser) Perl module. As a result there is **no clear email address** in this dataset, **nor any UUID or name**. However all identical information produces the same encrypted string, which means that one can still identify identical data without knowing what it actually is. As an example email addresses are split (name, company) and encoded separately, which enables one to e.g. identify posters from the same company without knowing the company.
The anonymisation technique used basically encrypts information and then throws away the private key. Please refer to the [documentation published on github](https://github.com/borisbaldassari/data-anonymiser) for more details.
## About this document
This document is an [R Markdown document](http://rmarkdown.rstudio.com) and is composed of both text (like this one) and dynamically computed information (mostly in the sections below) executed on the data itself. This ensures that the documentation is always synchronised with the data, and serves as a test suite for the dataset.
# Structure of data
This dataset is composed of a single big CSV file. Attributes are: ``r names.orig``.
Examples are provided at the end of this file to demonstrate how to use it in R.
## list {#list}
* Description: The mailing list and project of the post.
* Type: String
Examples:
```{r list.sample, warning=FALSE, echo=F, results='asis'}
# Draw five distinct mailing list names at random and render them as a table.
list_names <- unique(project.csv$list)
extract <- sample(list_names, size = 5)
kable(
  extract,
  caption = "Sample of list names",
  col.names = "Project list names"
)
```
## messageId {#message_id}
* Description: A unique identifier for the post.
* Type: String (Scrambled Base64)
Examples:
```{r messageid.sample, warning=FALSE, echo=FALSE, results='asis'}
# Draw five distinct message identifiers at random and render them as a table.
message_ids <- unique(project.csv$messageid)
extract <- sample(message_ids, size = 5)
kable(
  extract,
  caption = "Sample of message IDs",
  col.names = "Message ID"
)
```
## Subject {#subject}
* Description: The subject of the post as sent on the mailing list.
* Type: String
Examples:
```{r subject.sample, warning=FALSE, echo=FALSE, results='asis'}
# Draw five distinct subjects at random and render them as a table.
subjects <- unique(project.csv$subject)
extract <- sample(subjects, size = 5)
kable(
  extract,
  caption = "Sample of email subjects",
  col.names = "Subject"
)
```
## Sent at {#sent_at}
* Description: The time of sending for the post.
* Type: Date (ISO 8601)
Main characteristics:
* **First date**: `r first(index(project.xts))`
* **Last date**: `r last(index(project.xts))`
Examples:
```{r sentat.sample, warning=FALSE, echo=FALSE, results='asis'}
# Pick the sending dates of five randomly chosen posts and render them as a table.
sent_dates <- project.csv$sent_at
extract <- sample(sent_dates, size = 5)
kable(
  extract,
  caption = "Sample of sent dates",
  col.names = "Sent date"
)
```
## Sender name
* Description: The name of the sender of the post.
* Type: String (Scrambled Base64)
* Number of unique entries: `r length(unique(project.csv$sender_name))`
Examples:
```{r sendername.sample, warning=FALSE, echo=FALSE, results='asis'}
# Pick the (scrambled) sender names of five randomly chosen posts.
sender_names <- project.csv$sender_name
extract <- sample(sender_names, size = 5)
kable(
  extract,
  caption = "Sample of sender names",
  col.names = "Sender names"
)
```
Note: A single name repeated several times will always result in the same scrambled ID. This way it is possible to identify same-author posts without actually knowing the name of the sender.
## Sender address
* Description: The email address of the sender, encoded.
* Type: String (Scrambled Base64)
* Number of unique entries: `r length(unique(project.csv$sender_addr))`
Examples:
```{r senderaddr.sample, warning=FALSE, echo=FALSE, results='asis'}
# Pick the (scrambled) sender addresses of five randomly chosen posts.
sender_addresses <- project.csv$sender_addr
extract <- sample(sender_addresses, size = 5)
kable(
  extract,
  caption = "Sample of sender addresses",
  col.names = "Sender addresses"
)
```
Note: A single email address repeated several times will always result in the same scrambled email address. Furthermore both parts of the email (name, company) are individually scrambled, which means that one can identify email addresses from the same company without actually knowing the real company or name of the sender.
# Using the dataset
## Reading CSV file
Reading file from `r file.in`.
```{r examples.init, echo=T}
# Load the dataset; the first line of the file holds the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
project.csv <- read.csv(file.in, header = TRUE)
```
We add a column for the Company, which we extract from the email address (i.e. the domain name):
```{r examples.init.comp, echo=T}
# Characters 18 to 33 of the scrambled sender address are kept as the company.
project.csv$Company <- substr(x = project.csv$sender_addr, 18, 33)
```
Number of columns in this dataset:
```{r examples.ncol, echo=T}
# Column count of the data frame, including the derived Company column.
ncol(project.csv)
```
Number of entries in this dataset:
```{r examples.nrow, echo=T}
# Row count of the data frame, i.e. one row per post.
nrow(project.csv)
```
Names of columns:
```{r examples.names, echo=T}
# Column names of the data frame, including the derived Company column.
names(project.csv)
```
## Using time series (xts)
The dataset needs to be converted to an `xts` object. We can use the `sent_at` attribute as a time index.
```{r examples.xts, echo=T}
# library() stops with an error if xts is not installed; require() would only
# return FALSE and let the next line fail with "could not find function".
library(xts)
# Build the time series, ordered by the parsed ISO 8601 sending time of each post.
project.xts <- xts(x = project.csv, order.by = parse_iso_8601(project.csv$sent_at))
```
## Plotting number of monthly posts
When considering the timeline of the dataset, it can be misleading when there are several submissions over a short period of time, compared to sparse time ranges. We'll use the `apply.monthly` function from `xts` to normalise the total number of monthly submissions.
```{r examples.xts.plot}
# Count the posts of each month, then draw the monthly counts as a line.
project.monthly <- apply.monthly(x = project.xts$sent_at, FUN = nrow)
monthly_posts_plot <- autoplot(project.monthly, geom = "line") +
  theme_minimal() +
  labs(title = "Number of monthly posts", x = "Time", y = "Number of posts")
monthly_posts_plot
```
## Plotting number of monthly reporters
One author can post several emails on the mailing list. Let's plot the monthly number of distinct authors on the mailing list. For this we need to count the number of unique occurrences of the email address (attribute `sender_addr`).
```{r xts.monthly.reporters}
# Number of distinct values found in `x`.
count_unique <- function(x) {
  distinct_values <- unique(x)
  length(distinct_values)
}

# Count the distinct sender addresses of each month, then draw them as a line.
project.monthly <- apply.monthly(x = project.xts$sender_addr, FUN = count_unique)
monthly_authors_plot <- autoplot(project.monthly, geom = "line") +
  theme_minimal() +
  labs(
    title = "Number of monthly distinct authors",
    x = "Time",
    y = "Number of authors"
  )
monthly_authors_plot
```
## Plotting activity of authors
We want to plot the number of emails sent by each author regardless of the mailing list they were sent on. We display only the 10 top posters:
```{r reporters.sample, warning=FALSE, echo=FALSE, results='asis'}
# Posts per sender address, most active senders first.
posts_per_sender <- table(project.csv$sender_addr)
authors <- sort(x = posts_per_sender, decreasing = TRUE)

# Keep the ten most active senders and turn them into a data frame
# (columns Var1 = address, Freq = number of posts).
authors.10 <- head(authors, n = 10)
authors.subset.df <- as.data.frame(authors.10)

# Characters 18 to 33 of the scrambled address are kept as the company.
authors.subset.df[["Company"]] <- substr(x = authors.subset.df$Var1, 18, 33)

kable(
  authors.subset.df,
  caption = "Top 10 senders on mailing lists",
  col.names = c("Sender address", "Number of posts", "Company")
)
```
```{r reporters.plot.init, echo=F}
# Number of top posters shown in the plot below.
n <- 50
```
Now plot these `r n` top posters with ggplot and use the company (i.e. second part of the email address) for the colour:
```{r reporters.plot}
# Keep the n most active senders as a two-column data frame (ID, Posts).
authors.subset <- head(authors, n = n)
authors.subset.df <- setNames(as.data.frame(authors.subset), c("ID", "Posts"))

# Split the scrambled address: characters 1-16 are used as the author label,
# characters 18-33 as the company.
authors.subset.df$Author <- substr(x = authors.subset.df$ID, 1, 16)
authors.subset.df$Company <- substr(x = authors.subset.df$ID, 18, 33)

# One bar per author, ordered by decreasing number of posts, coloured by company.
plot_title <- paste0(n, " overall top posters on Eclipse mailing lists")
p <- ggplot(
  data = authors.subset.df,
  aes(x = reorder(Author, -Posts), y = Posts, fill = Company)
) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = plot_title, x = "Posters", y = "Number of posts") +
  theme(axis.text.x = element_text(angle = 60, size = 7, hjust = 1))

# Convert to an interactive plotly widget and display it.
g <- ggplotly(p)
g
#api_create(g, filename = "r-eclipse_mls_authors")
```
## Posts by Company
We want to know which companies posted the most messages in mailing lists across years. To that end we select the 20 companies that have the largest number of posts and plot the number of messages by company year after year.
```{r comp.init}
# The (at most) 20 companies with the largest total number of posts.
comps_list <- head(sort(x = table(project.csv$Company), decreasing = TRUE), n = 20)

# Empty result with the intended column types; it is also what `df` stays
# when no company is found.
df <- data.frame(
  Company = character(),
  Year = character(),
  Posts = integer(),
  stringsAsFactors = FALSE
)

# Build one data frame per company (one row per year) and bind them all once
# at the end, instead of growing `df` with rbind() on every iteration.
# Iterating over comps_list itself keeps the loop in bounds when the dataset
# holds fewer than 20 companies.
yearly_rows <- vector("list", length(comps_list))
for (i in seq_along(comps_list)) {
  comp_name <- names(comps_list)[[i]]
  project.comp.xts <- project.xts[project.xts$Company == comp_name, ]
  project.comp.yearly <- apply.yearly(x = project.comp.xts$Company, FUN = nrow)
  # Columns are created with their own types: combining the name, year and
  # count with c() would coerce the count to a string and make ggplot treat
  # Posts as a discrete variable.
  yearly_rows[[i]] <- data.frame(
    Company = rep(comp_name, nrow(project.comp.yearly)),
    Year = format(index(project.comp.yearly), "%Y"),
    Posts = as.integer(coredata(project.comp.yearly)),
    stringsAsFactors = FALSE
  )
}
df <- do.call(rbind, c(list(df), yearly_rows))

df$Company <- as.character(df$Company)
df <- df[order(df$Company), ]

# Stacked bars: one bar per year, one coloured segment per company.
p <- ggplot(data = df, aes(x = Year, y = Posts, fill = Company)) +
  geom_bar(stat = "identity") +
  theme_minimal() + ylab("Number of posts") + xlab("Years") +
  ggtitle("Top 20 Companies involved in Eclipse mailing lists across years") +
  theme(axis.text.x = element_text(angle = 60, size = 7, hjust = 1))

# Convert to an interactive plotly widget and display it.
g <- ggplotly(p)
g
#api_create(g, filename = "r-eclipse_mls_companies")
```
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment