From fca00a90a6537267ba1fbdcb1627ba04abf44dc6 Mon Sep 17 00:00:00 2001
From: Alberto Pianon <alberto@pianon.eu>
Date: Tue, 1 Nov 2022 17:23:11 +0100
Subject: [PATCH] bash script to find upstream sources for files with missing
 reuse tags

---
 development/reuse/git_find_missing_reuse.sh | 83 +++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 development/reuse/git_find_missing_reuse.sh

diff --git a/development/reuse/git_find_missing_reuse.sh b/development/reuse/git_find_missing_reuse.sh
new file mode 100644
index 0000000..3f51657
--- /dev/null
+++ b/development/reuse/git_find_missing_reuse.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+# usage:
+#
+# 1. clone repo containing files with missing reuse tags (needles)
+# 2. clone "haystack" upstream repos (where to look for needles) in a separate
+#    directory (eg. haystack/)
+#      a. for each cloned repo, fetch *all* branches (see
+#         https://stackoverflow.com/a/10312587 )
+# 3. search needle files in haystack repos and generate a csv report:
+#    ./git_find_missing_reuse.sh <needle-dir> <haystack_dir> > report.csv
+
+needle_dir=${1:?"usage: $0 <needle-dir> <haystack_dir>"}    # repo to lint; abort when missing
+haystack_dir=${2:?"usage: $0 <needle-dir> <haystack_dir>"}  # directory of upstream clones
+
+# Print the paths that `reuse lint` lists as lacking copyright and licensing information.
+get_files () {
+  local line in_section=""
+  # The paths follow the section header as "* <path>" bullets; a blank line ends the section.
+  reuse lint | while read -r line; do   # -r: keep backslashes in paths intact
+    if [ "$line" == "The following files have no copyright and licensing information:" ]; then
+      in_section="1"
+    elif [ -z "$line" ]; then
+      in_section=""
+    elif [ -n "$in_section" ]; then
+      printf '%s\n' "${line#\* }"       # drop only the bullet, so spaces in paths survive
+    fi
+  done
+}
+
+join() {
+  # Print all arguments on one line, glued with ';' ("$*" joins on the first IFS character).
+  local IFS=';'; echo "$*"
+}
+
+get_branches () {
+  branch_array=( $(git branch --contains $commit | sed -E 's/^(\* |  )//' ) )
+  join "${branch_array[@]}"
+}
+
+
+find_object () {  # print a CSV row for each path holding blob $obj_hash in the oldest commit touching it
+  # sort -n | head -n 1 picks the numerically smallest %ct; a text sort misorders 9- vs 10-digit epochs
+  git log --all --pretty=tformat:'%ct %T %h %s' --find-object="$obj_hash" | sort -n | head -n 1 |
+    if read -r timestamp tree commit comment; then  # $commit is consumed by get_branches
+      git ls-tree -r "$tree" | grep -F -- "$obj_hash" | while read -r perm obj_type blob path; do
+        echo "$sha1,$needle_path,$remote_uri@$commit:$path,$(get_branches)"
+      done
+    fi
+}
+
+upstream_repos=$(for d in "$haystack_dir"/*/.git; do [ -e "$d" ] && echo "${d%/.git}"; done)  # upstream clones; [ -e ] drops an unmatched glob
+
+# Collect one "sha1 blob-hash path" line per file that reuse lint reports as untagged.
+files=$(
+  cd "$needle_dir" || exit 1             # subshell: the cwd of the script itself stays untouched
+  get_files | while IFS= read -r f; do   # line-wise, so paths containing spaces stay whole
+    hash=$(git hash-object -- "$f") || continue   # skip entries that cannot be hashed
+    sha1=$(sha1sum -- "$f" | cut -d" " -f 1)
+    echo "$sha1 $hash $f"
+  done
+) || { echo "failed to list files in needle dir: $needle_dir" >&2; exit 1; }
+
+echo "sha1sum,path,upstream,upstream_branches"  # csv header
+# For each needle file, search every upstream clone; print the matches or one NOT_FOUND row.
+echo "$files" | while read -r sha1 obj_hash needle_path; do
+  [ -n "$obj_hash" ] || continue  # an empty $files still yields one blank line: skip it
+  found_rows=()
+  for repo in $upstream_repos; do
+    rows=$(
+      cd "$repo" || exit  # subshell: a failed cd cannot strand later iterations elsewhere
+      # find_object reads remote_uri: as before, the URL on the last line of git remote -v
+      remote_uri=$(git remote -v | tail -1 | (read -r remote uri type; echo "$uri"))
+      find_object
+    )
+    if [ -n "$rows" ]; then found_rows+=( "$rows" ); fi
+  done
+  if [ "${#found_rows[@]}" -gt 0 ]; then
+    printf '%s\n' "${found_rows[@]}"
+  else
+    echo "$sha1,$needle_path,NOT_FOUND,NOT_FOUND"
+  fi
+done
-- 
GitLab