#!/bin/bash
# $Id$

# Acknowledgment:
#  The following code is an unmodified version (except for this comment,
#  the `Id' comment, and the inline_doc fragment) of an original work written by
#  Ken Moffat and is included here with his permission.
#

# FARCE: Farce Assists Rebuild Comparison Evaluation ;)
#
# to answer the question "can it rebuild itself?"
#
# We expect four arguments - first directory path, filelist
# containing the files in this directory which we wish to compare,
# second directory path, filelist for second directory.
#
# Yes, we could just compare everything in each tree, but the
# filelist script knows about files it can reasonably ignore,
# and this also allows us to build a system, boot it and get a
# list of files, build a full desktop environment, and only then
# build and boot the "can it build itself" test system and get
# _its_ filelist.
#
# What this script aims to do:
# ____________________________
#
# First, report files not in both builds.
#
# Then, confirm symlinks point to same targets.
#
# After that, compare individual files -
# if different, run the file name through 'expected'
# to pick out files that are unlikely to match (logs,
# pids, fstab [assumes '/' is a different device each time]),
# count these as 'expected'.
#
# For whatever is left, check the file type - ar archives
# have their members extracted and compared (every member has
# a timestamp), gzipped files are compared beyond their
# timestamp, binaries, at least those using shared libs or
# which are shared objects, are copied and subjected to
# --strip-debug.  If files match at this stage, count them as
# 'accepted'.
#
# As a last step for any file that doesn't match, copy it
# through some perl regexps to "process" it (convert any
# date, time, kernel-version information from standard formats
# into tokens), then see if the tokens match.
#
# For details of the regexps, see the tokenize function.
# Those files that match after this are also counted as
# 'accepted'.  Note that I don't always start from the kernel
# version that I'm going to build, so this copes with e.g. perl
# files that hardcode the kernel version.
#
# We now have files that don't match.  A few of these seem to be
# common to all builds - some (members of) c++ libraries or ar
# archives, a few programs (which perhaps use some sort of c++ code).
# The file name is passed to the 'failure' function - these
# recognized filenames are labelled as 'predictable FAIL:',
# anything else is labelled as 'unexpected FAIL:'.
#
# output:
#  stderr - files only in one of the builds, failure messages,
#  and totals.
#
#  farce-results - more details, including which files were treated
#  as expected differences, files where neither copy could be read,
#  files treated as accepted, with the reason (and member for ar
#  archives).  This data is typically up to 100 characters wide -
#  sometimes it's a bit more, but it doesn't wrap too badly in a
#  100 character xterm.
#
#  farce-extras - diffs for the files, or members, that didn't
#  match.  This file is to establish new regexps for picking up
#  date/time/kernel-version formats.
#
#  farce-identical - the names of the files which are identical
#
#  farce-substitutions - whenever using tokenizeanddiff results in a
#  difference being accepted, for both versions diff the before and
#  after versions to show what got changed.  If the file is a binary,
#  the output may still be hard to read.  Note that I _know_ glibc
#  version strings pass one of the regexps looking for a kernel version
#  - since I expect you to use the same version of glibc for each
#  build, this is not a problem.
#
#  farce-differ - the names of the files which could not be treated
#  as matching (whether or not I regard the failure as predictable)
#  for possible input to ICA processing.
#
# Copyright (C) 2005, 2006 Ken Moffat
#
# All rights reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
# NON INFRINGEMENT.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#

# NOTE(review): this copy of the script is damaged at this point - only the
# fragment ': <&2 echo "$@" >&5 }' survived.  Presumably a ': <<inline_doc'
# here-document, the definitions of VERSION, RESULT, EXTRAS, IDENTICAL,
# SUBS and DIFFER, and the dohelp() function used to live here (later code
# references all of them) - restore them from a pristine copy.

function emessage() {
  # write a string to both stderr and the results file (fd 5)
  # NOTE(review): the function header and the first echo are reconstructed
  # from the surviving '>&2' / 'echo "$@" >&5 }' fragments - confirm.
  echo "$@" >&2
  echo "$@" >&5
}

function expected() {
  # $1 is the name of a file which differs between the builds.
  # If we expect it to differ because of its name, report that,
  # count it (overall and for the current $TYPE) and return true;
  # otherwise return false without counting anything.
  case $1 in
    /boot/grub/menu.lst)
      # just in case somebody puts this into the main filesystem
      ;;
    /etc/aliases.db)
      # some sort of database for postfix, not parsable
      ;;
    /etc/blkid.tab)
      # includes dev name for rootfs
      ;;
    /etc/fstab)
      # fstab, e.g. ' / ' will differ
      ;;
    /etc/group*)
      ;;
    /etc/hosts)
      # with dhcp client, I add current ip address to this in a hook
      ;;
    /etc/ld.so.*)
      # .conf and .cache can vary,
      # particularly if one system has a full build when I run this
      ;;
    /etc/lilo.conf|/etc/yaboot.conf)
      # bootloader control, I assume grub will all be on a separate
      ;;
    /etc/mtab)
      # at a minimum, different '/'
      ;;
    /etc/ntp.drift)
      ;;
    /etc/passwd*)
      ;;
    /etc/shadow*)
      ;;
    /etc/ssh/*key|/etc/ssh/*pub)
      # openssh keys
      ;;
    /misc/*)
      # where I put buildscripts (which mostly won't change)
      # and stamps containing name/time/space which will differ in the times
      ;;
    /root/*)
      # expect .bash_history etc to differ - if we can read them
      ;;
    /usr/bin/lynx)
      # part of my initial builds, I guess this uses anonymous namespaces
      ;;
    /usr/include/c++/*/*/bits/stdc++.h.gch/*)
      # precompiled headers
      ;;
    /usr/lib*/libstdc++.a|/usr/lib*/libstdc++.so*|/usr/lib*/libsupc++.a)
      # probably, anonymous namespaces
      # libstdc++.a, libstdc++.so.n.n.n, libsupc++.a
      ;;
    /usr/share/info/dir)
      # if one system has had extra stuff built, this will likely be bigger
      ;;
    /usr/share/man/whatis)
      # if one system has had extra stuff built, this will likely be bigger
      ;;
    /var/lib/locate/locatedb)
      # if one system has had extra stuff built, this will likely be bigger
      ;;
    /var/lib/nfs/*)
      # allow nfs bookkeeping
      ;;
    /var/log/*)
      ;;
    /var/run/utmp)
      ;;
    /var/spool/fcron*)
      ;;
    /var/state/*)
      # allow dhcp leases
      ;;
    /var/tmp/random-seed)
      ;;
    # following start with wildcards
    *Image*|*.PPCBoot*|*vmlinuz*|*lfskernel*)
      # compressed kernels, sometimes just building at a different
      # date/time is enough to change the length of them, because the
      # long format date and time is part of the compressed data
      ;;
    *pid*)
      # pids, including e.g. /var/spool/postfix/pid/*
      ;;
    *)
      # nothing else is expected to be different
      return 1
      ;;
  esac

  message "expected difference in $1"
  expected=$((expected + 1))
  case $TYPE in
    AR)  EXPAR=$((EXPAR + 1)) ;;
    ELF) EXPELF=$((EXPELF + 1)) ;;
    # gzipped files with an expected name (e.g. rotated logs under
    # /var/log) are legitimate, so count them instead of aborting
    GZ)  EXPGZ=$((EXPGZ + 1)) ;;
    UNK) EXPUNK=$((EXPUNK + 1)) ;;
    # so far, no other valid types, so don't accumulate them
    *)
      emessage "internal error, expected difference for $1 of type $TYPE not allowed"
      exit 2
      ;;
  esac
  return 0
}

function failure() {
  # first parm is filename or token
  # second parm is the error message
  # update the appropriate total (overall and for the current $TYPE)
  # and write to both stderr and the results
  # by using emessage
  different=$((different + 1))
  case $TYPE in
    AR)  DIFAR=$((DIFAR + 1)) ;;
    ELF) DIFELF=$((DIFELF + 1)) ;;
    GZ)  DIFGZ=$((DIFGZ + 1)) ;;
    SYM) DIFSYM=$((DIFSYM + 1)) ;;
    UNK) DIFUNK=$((DIFUNK + 1)) ;;
    *)
      emessage "internal error in failure() for TYPE $TYPE"
      exit 2
      ;;
  esac
  # only real files from the first build are recorded on fd 9;
  # a token (such as TARGET for symlinks) will not exist there
  test -f "${P1}$1" && echo "$1" >&9
  emessage "FAIL: $2"
}

function fatal() {
  # unrecoverable error: print the message (on stdout) and give up
  echo "$*"
  exit 1
}

function filetype() {
  # classify ${P1}${FILE} as AR, ELF, GZ or UNK in $TYPE and
  # bump the matching TOT* counter.  Takes no arguments: it reads
  # the globals P1 and FILE.
  # -b keeps the file name out of the text we match against, so an
  # oddly-named file cannot be mistaken for one of the known types
  TYPE=$(file -b "${P1}${FILE}")
  case $TYPE in
    *'current ar archive'*)
      TOTAR=$((TOTAR + 1))
      TYPE=AR
      ;;
    *'ELF '*)
      TOTELF=$((TOTELF + 1))
      TYPE=ELF
      ;;
    *'gzip compressed data'*)
      TOTGZ=$((TOTGZ + 1))
      TYPE=GZ
      ;;
    *)
      TOTUNK=$((TOTUNK + 1))
      TYPE=UNK
      ;;
  esac
}

function message() {
  # write a string to the results file (fd 5)
  echo "$@" >&5
}

function onlyone() {
  # report files only in one build
  # $1 is '<' (only in the first build) or '>' (only in the second);
  # the lines are taken from the diff of the two filelists in $DIFF.
  # text should go to both stderr and the results,
  # but blank lines only go to the results
  local line
  if [ "$1" == '<' ]; then
    emessage "File(s) only in the first build"
  else
    emessage "File(s) only in the second build"
  fi
  message ""
  # read whole lines so that names containing blanks are reported
  # intact, and strip the leading '< ' or '> ' marker from each
  while IFS= read -r line; do
    emessage "${line#? }"
    only=$((only + 1))
  done < <(grep "^$1" "$DIFF")
  message ""
}

# 'test' functions are called with three arguments:
# the two paths and the filename
# - we know the file is of this type, so see if we
# can get it to match by reasonable means.
# if not, treat it as different.
#
# NB if the paths are relative, we need to prefix them
# with the original $PWD to access the .a files
#
function testar() {
  # $1, $2 are the two build paths, $3 is the archive name.
  # ar archives include timestamps for the members,
  # but diff doesn't show file timestamps unless the data differs
  # put out a message to help locate which archive any messages
  # about the members refer to.
  local U1 U2 D1 D2 M1 M2 M STATUS rc path

  # try just stripping them U1,2 undebuggable
  U1=$(mktemp) || fatal "cannot create a temporary file"
  U2=$(mktemp) || fatal "cannot create a temporary file"
  cp "${1}${3}" "$U1"
  cp "${2}${3}" "$U2"
  strip --strip-debug "$U1"
  strip --strip-debug "$U2"
  cmp -s "$U1" "$U2"
  # remember the result of cmp before rm overwrites $?
  rc=$?
  rm "$U1" "$U2"
  if [ "$rc" -eq 0 ]; then
    accepted=$((accepted + 1))
    ACCAR=$((ACCAR + 1))
    message "archive $3 matches after strip --strip-debug"
    return
  fi

  # rest of this function retained primarily for pathologically bad builds
  # put out a message in the log to help identify which archive has issues.
  message "examining ar archive $3"
  D1=$(mktemp -d) || fatal "cannot create a temporary directory"
  D2=$(mktemp -d) || fatal "cannot create a temporary directory"
  # extract in subshells so that our own working directory (and with
  # it any relative build path) is left untouched
  ( cd "$D1" && ar -x "${OP1}${1}${3}" )
  ( cd "$D2" && ar -x "${OP2}${2}${3}" )

  # diff the members - true means they match
  if diff -Na "$D1" "$D2" >/dev/null; then
    message "accept: $3 after diffing the members"
    accepted=$((accepted + 1))
    ACCAR=$((ACCAR + 1))
  else
    # process individual members to eliminate date/time/kernel-version
    # first, check the members are the same
    M1=$(mktemp) || fatal "cannot create a temporary file"
    M2=$(mktemp) || fatal "cannot create a temporary file"
    ( cd "$D1" && printf '%s\n' * ) | sort >"$M1"
    ( cd "$D2" && printf '%s\n' * ) | sort >"$M2"
    if ! cmp -s "$M1" "$M2"; then
      # oh dear, different members
      echo "list of members differs for archive $3" >&6
      diff "$M1" "$M2" >&6
      failure "$3" "$3 list of members differs"
    else
      # members (names) are same,
      # process each one
      STATUS=0
      for path in "$D1"/*; do
        # an empty archive leaves the pattern unexpanded
        [ -e "$path" ] || continue
        M=${path##*/}
        # avoid firing up perl on matching members
        if ! cmp -s "$D1/$M" "$D2/$M"; then
          if tokenizeanddiff "$D1/$M" "$D2/$M" "$FILE:$M"; then
            message "member $M matches after processing"
          else
            message "member $M DIFFERS after processing"
            STATUS=1
          fi
        fi
      done
      if [ "$STATUS" -eq 0 ]; then
        accepted=$((accepted + 1))
        ACCAR=$((ACCAR + 1))
      else
        different=$((different + 1))
        DIFAR=$((DIFAR + 1))
        echo "$3" >&9
        emessage "FAIL: in $3"
      fi
    fi
    rm "$M1" "$M2"
  fi
  rm -rf -- "$D1" "$D2"
}

function testgzip() {
  # $1, $2 are the two build paths, $3 is the file name.
  # bytes 4,5,6,7 are the timestamp, so ignore these
  if cmp -s -i 8 "${1}${3}" "${2}${3}"; then
    message "accept: $3 after ignoring gzip timestamp"
    accepted=$((accepted + 1))
    ACCGZ=$((ACCGZ + 1))
  else
    failure "$3" " $3 even after ignoring gzip timestamp"
  fi
}

function testso() {
  # $1, $2 are the two build paths, $3 is the file name.
  # shared object - first try stripping it
  # in fact, this now handles ALL ELF files
  local S1 S2
  S1=$(mktemp) || fatal "cannot create a temporary file"
  S2=$(mktemp) || fatal "cannot create a temporary file"
  cp "${1}${3}" "$S1"
  strip --strip-debug "$S1"
  cp "${2}${3}" "$S2"
  strip --strip-debug "$S2"
  if cmp -s "$S1" "$S2"; then
    message "accept: $3 after --strip-debug"
    accepted=$((accepted + 1))
    ACCELF=$((ACCELF + 1))
  elif tokenizeanddiff "$S1" "$S2" "$3"; then
    message "accept: $3 after --strip-debug and processing"
    accepted=$((accepted + 1))
    ACCELF=$((ACCELF + 1))
  else
    failure "$3" " $3 differs after stripping and processing"
  fi
  rm "$S1" "$S2"
}

function tokenize() {
  # use regexes to replace date/time/kernel-version text
  # with tokens which may allow files to match even though
  # they have hardcoded date/time/kernel-version.
  # arguments are file to process, and where to put it.

  # these regexes are somewhat long, and the order they
  # are applied in is important (to stop short ones being
  # used when a longer version would match).
  # KV00 linux version date (e.g. as in the kernel itself)
  #  allow 2 or 3 groups of three alphas here - optional smp, with day, mon
  # KV01 kernel version, including possible cpu details (that is for cdda2wav)
  # KV02 just the version, in quotes e.g. "2.6.12.6" or '2.6.13', for perl stuff
  #  except that "|' gives me grief, so try a boundary
  #  also, it might need local version on the end, I really want
  #  quote2.\d+.\d+.{0,32}quote - it is the quotes that don't work.
  # DT00 Day Mon .d+ hh:mm:ss TZN CCYY variations include non-caps and 'mon d'
  # DT01 Mon .d+ CCYY hh:mm:ss
  # DT02 hh:mm:ss Mon .d CCYY
  # DT03 Mon .d CCYY
  # DT04 Day Mon { ,d}d hh:mm:ss CCYY - for groff example postscript files
  #  (somewhat similar to DT00, but ' d' or ' dd' for day of month and no TZN )
  # DT05 hh:mm:ss
  # DT06 ISO date using space as separator
  # DT07 ISO date using dash as separator
  # DT08 ISO date using slash as separator
  # DT09 fullmonth (capitalised), day number, comma, 4-digit year (groff 1.18.1 ps)
  # DT10 dd, fullmonth (capitalised), 4-digit year (groff 1.18.1 manpages)
  # DT11 '(xample comma space digit(s) backslash ) in groff memef.ps which is
  #  quite clearly the day of the month when it was compiled, preceded by 'example'
  #  with something weird between the e and the x.

  if [ $# -ne 2 ]; then
    fatal "tokenizing called with $# arguments : $*"
  fi

  perl -p \
    -e 's/(L|l)inux.*\d\.\d\.\d+.* \#\d+( [A-Za-z][a-z]{2}){2,3} \d+ \d\d:\d\d:\d\d [A-Za-z]{3} \d{4}\b/%KV00%/g;' \
    -e 's/(L|l)inux( (\w|_)+)?(-| |_)\d\.\d(\.\d+){1,2}((-|_)?(\w|_)+)?( |\000)*/%KV01%/g;' \
    -e 's/\W2(\.\d+){2,3}(-|_)?((\w|_)+)?\s*\W/%KV02%/g;' \
    -e 's/\b([A-Za-z][a-z]{2} ){2}( |\d)?\d \d\d:\d\d:\d\d [A-Za-z]{3} \d{4}\b/%DT00%/g;' \
    -e 's/\b[A-Z][a-z]{2} ( |\d)\d \d{4} \d\d:\d\d:\d\d\b/%DT01%/g;' \
    -e 's/\b\d\d:\d\d:\d\d [A-Z][a-z]{2} ( |\d)\d \d{4}\b/%DT02%/g;' \
    -e 's/\b[A-Z][a-z]{2} ( |\d)\d \d{4}\b/%DT03%/g;' \
    -e 's/\b([A-Z][a-z]{2} ){2}( |\d)\d \d\d:\d\d:\d\d \d{4}/%DT04%/g;' \
    -e 's/\b\d\d:\d\d:\d\d\b/%DT05%/g;' \
    -e 's/\b\d{4} \d\d \d\d\b/%DT06%/g;' \
    -e 's/\b\d{4}-\d\d-\d\d\b/%DT07%/g;' \
    -e 's/\b\d{4}\/\d\d\/\d\d\b/%DT08%/g;' \
    -e 's/\b[A-Z][a-z]{2,} \d{1,2}, \d{4}/%DT09%/g;' \
    -e 's/\b\d\d [A-Z][a-z]{2,} \d{4}/%DT10%/g;' \
    -e 's/\(xample, \d{1,2}\\\)/%DT11%/g;' \
    <"$1" >"$2"
}

function tokenizeanddiff() {
  # Call tokenize for the inputs, then compare the results
  # Input arguments are path/filename for old and new versions
  # third parm is readable name (filename, or archivename:member)
  # to help understand what is in the extras output.
  # - sometimes called for files, but other times called for
  # members of ar archives extracted into temporary directories
  # Returns true if the tokenized versions match, false otherwise.
  #message tokenizeanddiff called for $1 $2 $3
  local F1 F2
  F1=$(mktemp) || fatal "cannot create a temporary file"
  F2=$(mktemp) || fatal "cannot create a temporary file"
  tokenize "$1" "$F1"
  tokenize "$2" "$F2"
  # actually, cmp is probably more efficient
  # but for picking up the pieces it will be better to
  # use diff to see what got through.
  cmp -s "$F1" "$F2"
  TOKENRESULT=$?
  if [ "$TOKENRESULT" -ne 0 ]; then
    echo "failure in $3..." >&6
    diff -a "$F1" "$F2" >&6
    rm "$F1" "$F2"
    return 1
  fi
  # show what we did
  echo "substitutions for $3" >&8
  echo "build one" >&8
  diff -a "$1" "$F1" >&8
  echo "build two" >&8
  diff -a "$2" "$F2" >&8
  rm "$F1" "$F2"
  return 0
}

function validateargs() {
  # validate the arguments: first directory, its filelist,
  # second directory, its filelist.  Every problem found is
  # reported on stderr; ARGS is set to 'valid' only if there are none.
  BAD=0
  if ! [ -d "$1" ]; then
    echo "Error: first argument is not a directory" >&2
    BAD=$((BAD + 1))
  fi
  # take the basename first, then drop everything from the first '-',
  # so that a '-' in a directory name cannot hide the 'filelist' prefix
  NAME=${2##*/}
  NAME=${NAME%%-*}
  if [ "$NAME" != filelist ]; then
    echo "Error: second argument is not a recognized filelist" >&2
    BAD=$((BAD + 1))
  fi
  if ! [ -d "$3" ]; then
    echo "Error: third argument is not a directory" >&2
    BAD=$((BAD + 1))
  fi
  NAME=${4##*/}
  NAME=${NAME%%-*}
  if [ "$NAME" != filelist ]; then
    echo "Error: fourth argument is not a recognized filelist" >&2
    BAD=$((BAD + 1))
  fi
  for I in "$1" "$2" "$3" "$4"; do
    if ! [ -r "$I" ]; then
      echo "Error: cannot read $I" >&2
      BAD=$((BAD + 1))
    fi
  done
  if [ "$1" == "$3" ]; then
    echo "Error: directory pathes are identical" >&2
    BAD=$((BAD + 1))
  fi
  if [ "$2" == "$4" ]; then
    echo "Error: filelist names are identical" >&2
    BAD=$((BAD + 1))
  fi
  if [ "$BAD" -eq 0 ]; then
    ARGS=valid
  fi
}

# Mainline

ARGS=unproven
OUTDIR=

# NOTE(review): the names of the output files are expected to be set
# near the top of the script.  These fallbacks only apply when a name
# is unset or empty; they follow the file names and descriptor usage
# described in the header comment - confirm against a pristine copy.
: "${RESULT:=farce-results}"
: "${EXTRAS:=farce-extras}"
: "${IDENTICAL:=farce-identical}"
: "${SUBS:=farce-substitutions}"
: "${DIFFER:=farce-differ}"

if [ $# -eq 1 ]; then
  case $1 in
    -version|--version)
      echo "$(basename "$0") version $VERSION"
      exit 0
      ;;
    -help|--help)
      dohelp
      exit 0
      ;;
  esac
fi

if [ "${1:-}" = "--directory" ]; then
  OUTDIR=${2:-}
  if [ -z "$OUTDIR" ]; then
    fatal "$(basename "$0"): --directory requires a directory name"
  fi
  shift 2
  # make sure the name ends with exactly the slash we rely on below
  case $OUTDIR in
    */) ;;
    *) OUTDIR=${OUTDIR}/ ;;
  esac
  echo "creating directory $OUTDIR"
  if ! mkdir -p "$OUTDIR"; then
    echo "cannot mkdir $OUTDIR"
    exit 1
  fi
fi

if [ $# -eq 4 ]; then
  validateargs "$@"
fi

if ! [ "$ARGS" == valid ]; then
  dohelp
  fatal "$(basename "$0"): error in arguments"
fi

# ok, we're happy, lets hit these files
# fd 5 results, 6 extras (diffs), 7 identical names,
# 8 substitutions, 9 names of differing files
exec 5>"${OUTDIR}${RESULT}"
exec 6>"${OUTDIR}${EXTRAS}"
exec 7>"${OUTDIR}${IDENTICAL}"
exec 8>"${OUTDIR}${SUBS}"
exec 9>"${OUTDIR}${DIFFER}"

# a bare redirection: checks that the results file can be written
>"${OUTDIR}${RESULT}"
if [ $? -ne 0 ]; then
  fatal "cannot write to ${OUTDIR}$RESULT"
fi

emessage "will compare:"
emessage " first build at $1 with files listed in $2"
emessage "second build at $3 with files listed in $4"

accepted=0
different=0
expected=0
matched=0
only=0
predictable=0
unreadable=0
total=0

# break down the accepted
ACCAR=0
ACCELF=0
ACCGZ=0
ACCUNK=0

# break down definitely different
DIFAR=0
DIFELF=0
DIFGZ=0
DIFSYM=0
DIFUNK=0

# break down the expected differences
EXPAR=0
EXPELF=0
EXPGZ=0
EXPUNK=0

# break down the identical files
MATAR=0
MATELF=0
MATGZ=0
MATSYM=0
MATUNK=0

# break down how many of each type
TOTAR=0
TOTELF=0
TOTGZ=0
TOTSYM=0
TOTUNK=0

# now identify differences between the two trees
DIFF=$(mktemp) || fatal "cannot create a temporary file"
diff "$2" "$4" >"$DIFF"
for RUN in '<' '>'; do
  # only diff's own '<' / '>' line markers count, not those
  # characters appearing somewhere inside a file name
  grep -q "^$RUN" "$DIFF" && onlyone "$RUN"
done
rm "$DIFF"

# and compare them
message "Results of file comparison:"
message ""

# Strip any trailing slash from the path for tidiness,
# because the filenames all start with a slash.
# Unfortunately, '/' becomes empty, which breaks subroutines,
# so special case it.

# also, to process ar archives we need to extract them in temp
# directories - that means that after cd'ing we've broken any
# relative path, so save original pwd as necessary.
P1=${1%/}
case $1 in
  /*)
    # absolute path
    OP1=
    ;;
  *)
    # relative path
    OP1=${PWD}/
    ;;
esac
test -z "$P1" && P1='/'

P2=${3%/}
case $3 in
  /*)
    # absolute path
    OP2=
    ;;
  *)
    # relative path
    OP2=${PWD}/
    ;;
esac
test -z "$P2" && P2='/'

echo "about to read $2"
while IFS= read -r FILE; do
  #echo "process $FILE"
  #echo "test existence of ${P2}${FILE}"
  # confirm it exists in second build
  # we have already reported files only in one build
  if ! [ -f "${P2}${FILE}" ]; then
    continue
  fi
  total=$((total + 1))

  # check we can read both of them
  # or count as unreadable - I used to separate only-one-unreadable,
  # but if you compare '/' and a _copy_ of /mnt/lfs that assumption
  # breaks, so be less picky.
  if ! [ -r "${P1}${FILE}" ] || ! [ -r "${P2}${FILE}" ]; then
    message "cannot read one or both versions of $FILE"
    unreadable=$((unreadable + 1))
    continue
  fi

  if [ -h "${P1}${FILE}" ]; then
    # for symlink, look at what it points to
    # exceptionally, do not call filetype
    TYPE=SYM
    TOTSYM=$((TOTSYM + 1))
    # readlink gives the target itself, whatever the layout of
    # 'ls -l' output and even if a name contains blanks
    SL1=$(readlink "${P1}${FILE}")
    SL2=$(readlink "${P2}${FILE}")
    if [ "$SL1" = "$SL2" ]; then
      echo "symlink $FILE matches for $SL1" >&5
      matched=$((matched + 1))
      MATSYM=$((MATSYM + 1))
    else
      # failure is given a token, so record the name ourselves
      failure TARGET " symlink $FILE points to $SL1 and $SL2"
      echo "$FILE" >&9
    fi
    continue
  fi

  # regular file, start by typing it for accounting,
  # then compare it
  filetype "${P1}${FILE}"
  if cmp -s "${P1}${FILE}" "${P2}${FILE}"; then
    matched=$((matched + 1))
    case $TYPE in
      AR)  MATAR=$((MATAR + 1)) ;;
      ELF) MATELF=$((MATELF + 1)) ;;
      GZ)  MATGZ=$((MATGZ + 1)) ;;
      UNK) MATUNK=$((MATUNK + 1)) ;;
      *)
        echo "unexpected TYPE of $TYPE for $FILE" >&2
        exit 2
        ;;
    esac
    echo "${FILE}" >&7
    continue
  fi

  # seems different, can we do better ?
  # test if we expect it to differ
  if ! expected "$FILE"; then
    case $TYPE in
      GZ)
        testgzip "$P1" "$P2" "$FILE"
        ;;
      AR)
        testar "$P1" "$P2" "$FILE"
        ;;
      ELF)
        testso "$P1" "$P2" "$FILE"
        ;;
      *)
        # long-stop - strip dates from text files
        if tokenizeanddiff "${P1}${FILE}" "${P2}${FILE}" "$FILE"; then
          message "accepted $FILE after processing"
          accepted=$((accepted + 1))
          ACCUNK=$((ACCUNK + 1))
        else
          failure "$FILE" " $FILE is different"
        fi
        ;;
    esac
  fi
done <"$2"

message ""

# write totals to stderr as well as the results file
emessage "$only files in only one of the builds"
emessage "$total files compared, of which"
emessage "$unreadable files could not be read, skipped"
emessage "$matched files are identical"
emessage "$expected files differed as expected"
emessage "$accepted files had allowable differences"
#emessage "$predictable files differed as they normally do"
emessage "$different files differed"

# totals of different file types
emessage ""
emessage "$TOTAR ar archives"
emessage " of which $MATAR are identical"
emessage " of which $ACCAR are accepted after strip-debug or extracting, diffing, tokenizing"
emessage " of which $EXPAR differed as expected"
emessage " of which $DIFAR differed"
emessage "$TOTELF ELF executables or shared libraries"
emessage " of which $MATELF are identical"
emessage " of which $ACCELF are accepted after stripping and tokenizing"
emessage " of which $EXPELF differed as expected"
emessage " of which $DIFELF differed"
emessage "$TOTGZ gzipped files"
emessage " of which $MATGZ are identical"
emessage " of which $ACCGZ are accepted after comparing beyond timestamp"
emessage " of which $EXPGZ differed as expected"
emessage " of which $DIFGZ are different"
emessage "$TOTSYM symbolic links"
emessage " of which $MATSYM are identical"
emessage " of which $DIFSYM have different targets"
emessage "$TOTUNK other files"
emessage " of which $MATUNK are identical"
emessage " of which $ACCUNK are accepted after tokenizing"
emessage " of which $EXPUNK differed as expected"
emessage " of which $DIFUNK differed"