#!/bin/bash
#
# Find upstream sources for files with missing REUSE tags.
#
# Originally added as development/reuse/git_find_missing_reuse.sh by commit
# fca00a90a6537267ba1fbdcb1627ba04abf44dc6
#   Author:  Alberto Pianon <alberto@pianon.eu>
#   Date:    Tue, 1 Nov 2022 17:23:11 +0100
#   Subject: bash script to find upstream sources for files with missing
#            reuse tags
#
# usage:
#
# 1. clone repo containing files with missing reuse tags (needles)
# 2. clone "haystack" upstream repos (where to look for needles) in a separate
#    directory (eg. haystack/)
#    a. for each cloned repo, fetch *all* branches (see
#       https://stackoverflow.com/a/10312587 )
# 3. search needle files in haystack repos and generate a csv report:
#    ./git_find_missing_reuse.sh <needle-dir> <haystack_dir> > report.csv
#
# Requires: bash, reuse, sha1sum and a git that supports
# `git log --find-object` and `git branch --format`.

# Print a diagnostic on stderr, prefixed with the script name.
warn() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
}

#######################################
# List the files that `reuse lint` reports under the heading
# "The following files have no copyright and licensing information:".
# Must be called with the needle repository as current directory.
# Outputs: one path per line, exactly as printed by reuse minus the
#          leading "* " bullet.
#######################################
get_files() {
  local line in_section=""
  reuse lint | while read -r line; do
    if [[ -n "$in_section" ]]; then
      if [[ -n "$line" ]]; then
        # Strip only the bullet, so paths containing spaces stay intact.
        printf '%s\n' "${line#\* }"
      else
        # A blank line ends the section.
        in_section=""
      fi
    elif [[ "$line" == "The following files have no copyright and licensing information:" ]]; then
      in_section="1"
    fi
  done
}

# Join all arguments into a single line, separated by ';'.
join() {
  local IFS=";"
  printf '%s\n' "$*"
}

#######################################
# Print the arguments as one CSV record. A field is wrapped in double
# quotes (with embedded quotes doubled) only when it contains a comma, a
# double quote or a line break, so ordinary fields are printed verbatim.
#######################################
csv_row() {
  local field row="" sep="" dq='"'
  for field in "$@"; do
    case "$field" in
      *,* | *\"* | *$'\n'* | *$'\r'*)
        field="${dq}${field//$dq/$dq$dq}${dq}"
        ;;
    esac
    row+="${sep}${field}"
    sep=","
  done
  printf '%s\n' "$row"
}

#######################################
# Print the branches of the repository in the current directory that
# contain a commit, joined by ';'.
# Globals:   commit (read) - used when no argument is given
# Arguments: $1 - commit to look up (optional, defaults to $commit)
#######################################
get_branches() {
  local rev=${1:-$commit}
  local name
  local -a names=()
  # --format yields one clean ref name per line, so there is no "* "/"  "
  # prefix to strip and no word-splitting of the porcelain output.
  while IFS= read -r name; do
    names+=("$name")
  done < <(git branch --contains "$rev" --format='%(refname:short)')
  join "${names[@]}"
}

#######################################
# Look up a blob in the repository in the current directory and print one
# CSV record for every path it has in the oldest commit that `git log
# --find-object` reports for it.
# Globals:   obj_hash    (read) - git blob hash to search for
#            sha1        (read) - sha1sum of the needle file (copied to output)
#            needle_path (read) - path of the needle file (copied to output)
#            remote_uri  (read) - URI of the repository being searched
# Outputs:   sha1,needle_path,remote_uri@commit:path,branches
#            (nothing when the blob is not found)
#######################################
find_object() {
  local first tree commit
  local mode obj_type found_hash path branches

  # Lines start with the committer timestamp; numeric sort puts the oldest
  # commit first.
  first=$(
    git log --all --pretty=tformat:'%ct %T %h %s' --find-object="$obj_hash" \
      | sort -n \
      | head -n 1
  )
  [[ -n "$first" ]] || return 0
  read -r _ tree commit _ <<<"$first"

  git ls-tree -r "$tree" \
    | grep -F -- "$obj_hash" \
    | while read -r mode obj_type found_hash path; do
        # grep is only a fast pre-filter: require an exact match on the
        # object field so that a hash-like path cannot produce a false hit.
        [[ "$found_hash" == "$obj_hash" ]] || continue
        branches=$(get_branches "$commit")
        csv_row "$sha1" "$needle_path" "${remote_uri}@${commit}:${path}" "$branches"
      done
}

#######################################
# Print every direct subdirectory of a directory that contains a `.git`
# entry, one per line.
# Arguments: $1 - haystack directory
#######################################
list_upstream_repos() {
  local git_entry
  for git_entry in "$1"/*/.git; do
    # When the glob matches nothing it is left as a literal pattern.
    [[ -e "$git_entry" ]] || continue
    printf '%s\n' "${git_entry%/.git}"
  done
}

#######################################
# Entry point: write the CSV report to stdout.
# Arguments: $1 - needle directory, $2 - haystack directory
# Returns:   0 on success, 2 on usage error, 1 on a missing directory or tool
#######################################
main() {
  if (( $# != 2 )); then
    printf 'usage: %s <needle-dir> <haystack_dir> > report.csv\n' "${0##*/}" >&2
    return 2
  fi

  local needle_dir=$1 haystack_dir=$2
  local dir tool
  for dir in "$needle_dir" "$haystack_dir"; do
    if [[ ! -d "$dir" ]]; then
      warn "not a directory: $dir"
      return 1
    fi
  done
  for tool in git reuse sha1sum; do
    if ! command -v "$tool" > /dev/null; then
      warn "required tool not found: $tool"
      return 1
    fi
  done

  local repo
  local -a repos=()
  while IFS= read -r repo; do
    repos+=("$repo")
  done < <(list_upstream_repos "$haystack_dir")
  if (( ${#repos[@]} == 0 )); then
    warn "no git repositories found in $haystack_dir"
  fi

  # Directory changes happen in subshells only, so relative paths given on
  # the command line stay valid throughout and a failed cd cannot leave the
  # script in the wrong directory. cd output is discarded because cd prints
  # the directory when it is resolved through CDPATH.
  local needle_path
  local -a needle_paths=()
  while IFS= read -r needle_path; do
    needle_paths+=("$needle_path")
  done < <(cd -- "$needle_dir" > /dev/null && get_files)

  printf '%s\n' "sha1sum,path,upstream,upstream_branches" # csv header

  # obj_hash, sha1, needle_path and remote_uri are read by find_object.
  local obj_hash sha1 remote_uri match matches
  for needle_path in "${needle_paths[@]}"; do
    if ! obj_hash=$(git -C "$needle_dir" hash-object -- "$needle_path"); then
      warn "cannot hash $needle_path, skipping"
      continue
    fi
    # Reading from stdin keeps the file name out of sha1sum's output.
    sha1=$(sha1sum < "$needle_dir/$needle_path" | cut -d' ' -f1)

    matches=""
    for repo in "${repos[@]}"; do
      match=$(
        cd -- "$repo" > /dev/null || exit 1
        # URI from the last line printed by `git remote -v`.
        remote_uri=$(
          git remote -v | tail -n 1 | {
            read -r _ uri _
            printf '%s\n' "$uri"
          }
        )
        find_object
      )
      if [[ -n "$match" ]]; then
        matches+="${match}"$'\n'
      fi
    done

    if [[ -n "$matches" ]]; then
      printf '%s' "$matches"
    else
      csv_row "$sha1" "$needle_path" NOT_FOUND NOT_FOUND
    fi
  done
}

# main returns instead of exiting; as the last command, its status becomes
# the exit status of the script.
main "$@"