Coverage Report - com.puppycrawl.tools.checkstyle.checks.duplicates.StrictDuplicateCodeCheck

Classes in this File                                  Line Coverage    Branch Coverage   Complexity
StrictDuplicateCodeCheck                              87%  (116/132)   78%  (39/50)      3.1
StrictDuplicateCodeCheck$1                            N/A              N/A               3.1
StrictDuplicateCodeCheck$ChecksumGenerator            N/A              N/A               3.1
StrictDuplicateCodeCheck$JavaChecksumGenerator        100% (4/4)       100% (2/2)        3.1
StrictDuplicateCodeCheck$TextfileChecksumGenerator    96%  (24/25)     92%  (13/14)      3.1
 
   1        ////////////////////////////////////////////////////////////////////////////////
   2        // checkstyle: Checks Java source code for adherence to a set of rules.
   3        // Copyright (C) 2001-2014  Oliver Burn
   4        //
   5        // This library is free software; you can redistribute it and/or
   6        // modify it under the terms of the GNU Lesser General Public
   7        // License as published by the Free Software Foundation; either
   8        // version 2.1 of the License, or (at your option) any later version.
   9        //
  10        // This library is distributed in the hope that it will be useful,
  11        // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12        // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13        // Lesser General Public License for more details.
  14        //
  15        // You should have received a copy of the GNU Lesser General Public
  16        // License along with this library; if not, write to the Free Software
  17        // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18        ////////////////////////////////////////////////////////////////////////////////
  19        package com.puppycrawl.tools.checkstyle.checks.duplicates;
  20
  21        import com.google.common.collect.ArrayListMultimap;
  22        import com.google.common.collect.Lists;
  23        import com.google.common.collect.MapMaker;
  24        import com.google.common.collect.Multimap;
  25        import com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck;
  26        import com.puppycrawl.tools.checkstyle.api.FileText;
  27        import com.puppycrawl.tools.checkstyle.api.MessageDispatcher;
  28        import com.puppycrawl.tools.checkstyle.api.Utils;
  29        import java.io.File;
  30        import java.io.IOException;
  31        import java.util.Collection;
  32        import java.util.List;
  33        import java.util.Map;
  34        import org.apache.commons.logging.Log;
  35        import org.apache.commons.logging.LogFactory;
  36
  37        /**
  38         * Performs a line-by-line comparison of all code lines and reports
  39         * duplicate code if a sequence of lines differs only in
  40         * indentation.  All import statements in Java code are ignored, any
  41         * other line - including javadoc, whitespace lines between methods,
  42         * etc. - is considered (which is why the check is called
  43         * <em>strict</em>).
  44         *
  45         * @author Lars K&uuml;hne
  46         */
  47  1035  public final class StrictDuplicateCodeCheck extends AbstractFileSetCheck
  48        {
  49            /**
  50             * A prime that is used to calculate checksums of lines and blocks of lines.
  51             * It is important that it is larger than the length of most lines, to
  52             * avoid hash collisions.
  53             *
  54             * For a list of primes see
  55             * http://www.utm.edu/research/primes/lists/small/1000.txt
  56             */
  57            private static final int BIG_PRIME = 317;
  58
  59            /**
  60             * Converts each consecutive block of {@link #mMin} lines of the
  61             * original source lines to a checksum that is compared against
  62             * others to find duplicates.
  63             */
  64            private interface ChecksumGenerator
  65            {
  66                /**
  67                 * Converts each block of the original source lines
  68                 * to a checksum that is compared against others to find duplicates.
  69                 *
  70                 * @param aOriginalLines the original lines as they appear
  71                 * in the source file
  72                 *
  73                 * @return an array of (aOriginalLines.length - mMin + 1) checksums
  74                 */
  75                int[] convertLines(String[] aOriginalLines);
  76            }
  77

  78
  79            /**
  80             * Calculates checksums for text files.
  81             */
  82    12      private class TextfileChecksumGenerator implements ChecksumGenerator
  83            {
  84                /** {@inheritDoc} */
  85                public int[] convertLines(String[] aOriginalLines)
  86                {
  87     6              final int lineCount = aOriginalLines.length;
  88     6              final long[] checkSums = new long[lineCount];
  89   149              for (int i = 0; i < lineCount; i++) {
  90   143                  final String line = aOriginalLines[i];
  91   143                  checkSums[i] = calcChecksum(line);
  92                    }
  93     6              final int retLen = Math.max(0, lineCount - mMin + 1);
  94     6              final int[] ret = new int[retLen];
  95
  96   117              for (int i = 0; i < retLen; i++) {
  97   111                  int blockChecksum = 0;
  98   111                  boolean onlyEmptyLines = true;
  99  1029                  for (int j = 0; j < mMin; j++) {
 100   924                      if (aOriginalLines[i + j].length() > 0) {
 101   614                          onlyEmptyLines = false;
 102                            }
 103   924                      final long checksum = checkSums[i + j];
 104   924                      if (checksum == IGNORE) {
 105     6                          blockChecksum = IGNORE;
 106     6                          break;
 107                            }
 108   918                      blockChecksum += (j + 1) * BIG_PRIME * checksum;
 109                        }
 110   111                  ret[i] = onlyEmptyLines ? IGNORE : blockChecksum;
 111                    }
 112     6              return ret;
 113                }
 114
 115                /**
 116                 * Computes a checksum for a single line of text.
 117                 * @param aLine the line of text
 118                 * @return the checksum of the line
 119                 */
 120                protected int calcChecksum(String aLine)
 121                {
 122   137              final int hashCode = aLine.hashCode();
 123   137              if (hashCode == IGNORE) {
 124     0                  return Integer.MAX_VALUE / 2;
 125                    }
 126   137              return hashCode;
 127                }
 128            }
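
The windowed checksum in convertLines() above is the core trick: every block of mMin trimmed lines collapses into one int, and the position weight (j + 1) makes the same lines in a different order produce a different sum. A minimal standalone sketch of the same idea follows; the class and method names are illustrative only, and the IGNORE propagation for already-ignored lines is left out for brevity:

    import java.util.Arrays;

    /** Illustrative re-implementation of the windowed block checksum. */
    public class BlockChecksumDemo
    {
        private static final int IGNORE = Integer.MIN_VALUE;
        private static final int BIG_PRIME = 317;

        /** Checksums each window of aMin consecutive lines. */
        static int[] blockChecksums(String[] aLines, int aMin)
        {
            final int retLen = Math.max(0, aLines.length - aMin + 1);
            final int[] ret = new int[retLen];
            for (int i = 0; i < retLen; i++) {
                int blockChecksum = 0;
                boolean onlyEmptyLines = true;
                for (int j = 0; j < aMin; j++) {
                    final String line = aLines[i + j];
                    if (line.length() > 0) {
                        onlyEmptyLines = false;
                    }
                    // the (j + 1) weight makes "a" then "b" differ from "b" then "a"
                    blockChecksum += (j + 1) * BIG_PRIME * line.hashCode();
                }
                // all-empty windows carry no information, so they are ignored
                ret[i] = onlyEmptyLines ? IGNORE : blockChecksum;
            }
            return ret;
        }

        public static void main(String[] aArgs)
        {
            final String[] lines = {"int a = 0;", "a++;", "return a;"};
            // two windows of two lines each: [0..1] and [1..2]
            System.out.println(Arrays.toString(blockChecksums(lines, 2)));
        }
    }
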
 129
 130            /**
 131             * A TextfileChecksumGenerator that also ignores imports.
 132             */
 133    12      private class JavaChecksumGenerator extends TextfileChecksumGenerator
 134            {
 135                // TODO: return IGNORE for lines in the header comment?
 136                // That would require some simple parsing...
 137
 138                // we could also parse the java code using the TreeWalker
 139                // and then ignore everything before the CLASS_DEF...
 140
 141                @Override
 142                protected int calcChecksum(String aLine)
 143                {
 144                    // to avoid false alarms it is important that different lines
 145                    // result in different checksums.
 146   143              if (aLine.startsWith("import ")) {
 147     6                  return IGNORE;
 148                    }
 149   137              return super.calcChecksum(aLine);
 150                }
 151            }
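
Two sentinel details in the generators above are easy to miss: import lines map to IGNORE so they can never be reported, and calcChecksum() remaps any line whose hashCode() happens to equal IGNORE, so real content cannot masquerade as the sentinel. A small combined sketch (a hypothetical demo class, not part of this file):

    /** Demo of the IGNORE sentinel handling in the two generators. */
    public class ChecksumSentinelDemo
    {
        private static final int IGNORE = Integer.MIN_VALUE;

        static int calcChecksum(String aLine)
        {
            if (aLine.startsWith("import ")) {
                return IGNORE;                // imports never count as duplicates
            }
            final int hashCode = aLine.hashCode();
            if (hashCode == IGNORE) {
                return Integer.MAX_VALUE / 2; // keep real lines off the sentinel value
            }
            return hashCode;
        }

        public static void main(String[] aArgs)
        {
            System.out.println(calcChecksum("import java.util.List;") == IGNORE); // true
            System.out.println(calcChecksum("int x = 1;"));  // an ordinary hash code
        }
    }
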
 152
 153            /** A Jakarta Commons log. */
 154     1      private static final Log LOG =
 155                    LogFactory.getLog(StrictDuplicateCodeCheck.class);
 156
 157            /** The checksum value to use for lines that should be ignored. */
 158            static final int IGNORE = Integer.MIN_VALUE;
 159
 160            /** Default value for mMin. */
 161            private static final int DEFAULT_MIN_DUPLICATE_LINES = 12;
 162
 163            /** Number of lines that have to be identical for reporting duplicates. */
 164     4      private int mMin = DEFAULT_MIN_DUPLICATE_LINES;
 165
 166            /** The base directory to strip off in filenames. */
 167            private String mBasedir;
 168
 169            /**
 170             * The checksums of all files that are currently checked.
 171             * Dimension one: file index
 172             * Dimension two: block start line
 173             */
 174            private int[][] mLineBlockChecksums;
 175
 176            /**
 177             * Helper to speed up the searching algorithm; holds the checksums from
 178             * {@link #mLineBlockChecksums}, except {@link #IGNORE}, in sorted order.
 179             */
 180            private ChecksumInfo[] mChecksumInfo;
 181
 182            /** Files that are currently checked. */
 183     4      private final List<File> mFiles = Lists.newArrayList();
 184
 185            /**
 186             * A SoftReference cache for the trimmed lines of a file path.
 187             */
 188     4      private final Map<String, String[]> mTrimmedLineCache =
 189                new MapMaker().softValues().makeMap();
 190
 191            // fields required only for statistics
 192
 193            /** Total number of duplicates found. */
 194            private int mDuplicates;
 195            /** The charset used to load files. */
 196            private String mCharset;
 197
 198            /** Creates a new instance of this class. */
 199            public StrictDuplicateCodeCheck()
 200     4      {
 201     4      }
 202
 203            /**
 204             * Sets the minimum number of lines that must be equivalent
 205             * before the check complains.
 206             *
 207             * @param aMin the number of lines that must be equal before
 208             * triggering a 'duplicate code' message.
 209             */
 210            public void setMin(int aMin)
 211            {
 212     2          if (aMin < 1) {
 213     0              throw new IllegalArgumentException("min must be 1 or higher");
 214                }
 215     2          mMin = aMin;
 216     2      }
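
setMin() and setBasedir() are the check's only knobs. A short usage sketch; in practice the Checkstyle runner instantiates and configures the check from its configuration file, so constructing it directly like this is for illustration only (the path is made up):

    import com.puppycrawl.tools.checkstyle.checks.duplicates.StrictDuplicateCodeCheck;

    public class ConfigureCheckDemo
    {
        public static void main(String[] aArgs)
        {
            final StrictDuplicateCodeCheck check = new StrictDuplicateCodeCheck();
            check.setMin(20);                   // require 20 equal lines, not the default 12
            check.setBasedir("/home/user/project"); // report file names relative to this dir
            // check.setMin(0) would throw IllegalArgumentException("min must be 1 or higher")
        }
    }
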
 217
 218            /** @param aBasedir the base directory to strip off in filenames */
 219            public void setBasedir(String aBasedir)
 220            {
 221     4          mBasedir = aBasedir;
 222     4      }
 223
 224            @Override
 225            public void beginProcessing(String aCharset)
 226            {
 227     4          super.beginProcessing(aCharset);
 228     4          mCharset = aCharset;
 229     4          mFiles.clear();
 230     4      }
 231
 232            @Override
 233            protected void processFiltered(File aFile, List<String> aLines)
 234            {
 235     6          mFiles.add(aFile);
 236     6      }
 237
 238            @Override
 239            public void finishProcessing()
 240            {
 241     4          super.finishProcessing();
 242     4          final long start = System.currentTimeMillis();
 243     4          mDuplicates = 0;
 244     4          mLineBlockChecksums = new int[mFiles.size()][];
 245     4          mChecksumInfo = new ChecksumInfo[mFiles.size()];
 246
 247     4          if (LOG.isDebugEnabled()) {
 248     0              LOG.debug("Reading " + mFiles.size() + " input files");
 249                }
 250
 251    10          for (int i = 0; i < mFiles.size(); i++) {
 252     6              final File file = mFiles.get(i);
 253                    try {
 254     6                  final String[] lines = getTrimmedLines(file);
 255     6                  final ChecksumGenerator transformer =
 256                            findChecksumGenerator(file);
 257     6                  mLineBlockChecksums[i] = transformer.convertLines(lines);
 258                    }
 259     0              catch (final IOException ex) {
 260     0                  LOG.error("Cannot access " + file + " ("
 261                                  + ex.getMessage() + "), ignoring", ex);
 262                        // TODO: better to throw an exception here?
 263                        // it would be best to throw IOException from process(),
 264                        // but interface definition doesn't allow that...
 265     0                  mLineBlockChecksums = new int[0][0];
 266     6              }
 267                }
 268     4          fillSortedRelevantChecksums();
 269
 270     4          final long endReading = System.currentTimeMillis();
 271     4          findDuplicates();
 272     4          final long endSearching = System.currentTimeMillis();
 273
 274     4          dumpStats(start, endReading, endSearching);
 275
 276     4          mLineBlockChecksums = null;
 277     4          mChecksumInfo = null;
 278     4      }
 279
 280            /**
 281             * Finds the checksum generator for a given file.
 282             *
 283             * @param aFile the file to check for duplicates
 284             * @return a generator to use for aFile
 285             */
 286            private ChecksumGenerator findChecksumGenerator(File aFile)
 287            {
 288     6          if (aFile.getName().endsWith(".java")) {
 289     6              return new JavaChecksumGenerator();
 290                }
 291                // TODO: Not sure what to do with binary files such as gifs
 292     0          return new TextfileChecksumGenerator();
 293            }
 294
 295            /**
 296             * Dumps out statistics data on stderr.
 297             * @param aStart start time
 298             * @param aEndReading end time of reading physical files
 299             * @param aEndSearching end time of the duplicate analysis
 300             */
 301            private void dumpStats(long aStart, long aEndReading, long aEndSearching)
 302            {
 303     4          if (LOG.isDebugEnabled()) {
 304     0              final long initTime = aEndReading - aStart;
 305     0              final long workTime = aEndSearching - aEndReading;
 306     0              LOG.debug("files = " + mFiles.size());
 307     0              LOG.debug("duplicates = " + mDuplicates);
 308     0              LOG.debug("Runtime = " + initTime + " + " + workTime);
 309                }
 310     4      }
 311
 312            /**
 313             * Filters and sorts the relevant lines and stores the result
 314             * in {@link #mChecksumInfo} during the setup phase.
 315             * That data is later used in a binary search to find out
 316             * if it is worth investigating a file for duplicates of a block.
 317             * If the block's checksum does not occur in the other file
 318             * at all, we can skip that file quickly.
 319             */
 320            private void fillSortedRelevantChecksums()
 321            {
 322    10          for (int i = 0; i < mLineBlockChecksums.length; i++) {
 323     6              final int[] checksums = mLineBlockChecksums[i];
 324     6              mChecksumInfo[i] = new ChecksumInfo(checksums);
 325                }
 326     4      }
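
ChecksumInfo itself does not appear in this report. As a rough, hypothetical stand-in for what the javadoc above describes, the relevant checksums can be kept in a sorted array and probed with binary search (names and structure here are assumptions, not the real class):

    import java.util.Arrays;

    /** Hypothetical stand-in for ChecksumInfo, built on one sorted array. */
    public class SortedChecksums
    {
        private static final int IGNORE = Integer.MIN_VALUE;

        /** The non-IGNORE checksums, sorted for binary search. */
        private final int[] mSorted;

        SortedChecksums(int[] aChecksums)
        {
            mSorted = Arrays.stream(aChecksums)
                .filter(c -> c != IGNORE)
                .sorted()
                .toArray();
        }

        /** @return whether any checksum occurs in both files */
        boolean hasChecksumOverlapsWith(SortedChecksums aOther)
        {
            for (final int sum : mSorted) {
                if (Arrays.binarySearch(aOther.mSorted, sum) >= 0) {
                    return true;
                }
            }
            return false;
        }

        public static void main(String[] aArgs)
        {
            final SortedChecksums a = new SortedChecksums(new int[] {5, IGNORE, 9});
            final SortedChecksums b = new SortedChecksums(new int[] {7, 9});
            System.out.println(a.hasChecksumOverlapsWith(b)); // true: both contain 9
        }
    }
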
 327
 328            /**
 329             * Finds duplicate lines in mFiles,
 330             * using a text-search algorithm to find recurring
 331             * patterns in the line checksums.
 332             */
 333            private void findDuplicates()
 334            {
 335     4          if (LOG.isDebugEnabled()) {
 336     0              LOG.debug("Analysis phase");
 337                }
 338
 339                // It's been a while since my CS degree, but I think this is
 340                // somewhere near O(LOC^2).
 341
 342                // It may be possible to do this *much* smarter,
 343                // but I don't have the Knuth bible at hand right now :-)
 344
 345     4          final int len = mFiles.size();
 346    10          for (int i = 0; i < len; i++) {
 347
 348     6              final String path = mFiles.get(i).getPath();
 349     6              getMessageCollector().reset();
 350     6              final MessageDispatcher dispatcher = getMessageDispatcher();
 351     6              dispatcher.fireFileStarted(path);
 352
 353    14              for (int j = 0; j <= i; j++) {
 354     8                  findDuplicatesInFiles(i, j);
 355                    }
 356
 357     6              fireErrors(path);
 358     6              dispatcher.fireFileFinished(path);
 359                }
 360     4      }
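
The triangular loop above visits each unordered file pair exactly once, including the pair (i, i) so that duplicates within a single file are found as well. A tiny sketch of the iteration order:

    public class PairEnumerationDemo
    {
        public static void main(String[] aArgs)
        {
            final int len = 3;
            for (int i = 0; i < len; i++) {
                for (int j = 0; j <= i; j++) {
                    // prints (0,0) (1,0) (1,1) (2,0) (2,1) (2,2)
                    System.out.println("compare file " + i + " with file " + j);
                }
            }
        }
    }
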
 361
 362            /**
 363             * Compares two files and searches for duplicates.
 364             * @param aI mLineBlockChecksums index of the first file to compare
 365             * @param aJ mLineBlockChecksums index of the second file to compare
 366             */
 367            private void findDuplicatesInFiles(int aI, int aJ)
 368            {
 369     8          final ChecksumInfo iChecksumInfo = mChecksumInfo[aI];
 370     8          final ChecksumInfo jChecksumInfo = mChecksumInfo[aJ];
 371     8          if (!iChecksumInfo.hasChecksumOverlapsWith(jChecksumInfo)) {
 372     3              return;
 373                }
 374
 375     5          final int[] iLineBlockChecksums = mLineBlockChecksums[aI];
 376     5          final int iBlockCount = iLineBlockChecksums.length;
 377
 378                // blocks of duplicate code might be longer than 'min'. We need to
 379                // remember the line combinations where we must ignore identical blocks
 380                // because we have already reported them for an earlier blockIdx.
 381     5          final Multimap<Integer, Integer> ignorePairs =
 382                    ArrayListMultimap.create();
 383
 384                // go through all the blocks in iFile and
 385                // check if the following mMin lines occur in jFile
 386   116          for (int iLine = 0; iLine < iBlockCount; iLine++) {
 387
 388   111              final int iSum = iLineBlockChecksums[iLine];
 389   111              final int[] jLines = jChecksumInfo.findLinesWithChecksum(iSum);
 390                    // detailed analysis only if the iLine block occurs in jFile at all
 391   111              if (jLines.length > 0) {
 392    98                  findDuplicateFromLine(aI, aJ, iLine, jLines, ignorePairs);
 393                    }
 394                }
 395     5      }
 396
 397            /**
 398             * Finds and reports a duplicate of the code starting from line aILine
 399             * in file aI in the file aJ. The caller has already ensured that
 400             * there are at least mMin duplicate lines; this method mainly analyzes
 401             * how far the block of duplicates extends.
 402             *
 403             * @param aI index of file that contains the candidate code
 404             * @param aJ index of file that is searched for a dup of the candidate
 405             * @param aILine starting line of the candidate in aI
 406             * @param aJLines lines in file aJ that have the same checksum as aILine
 407             * @param aIgnore Bag from iLine to jLines, an entry indicates that
 408             * this line i/j-combination has already been reported as part of another
 409             * violation
 410             */
 411            private void findDuplicateFromLine(
 412                final int aI, final int aJ, final int aILine,
 413                final int[] aJLines, final Multimap<Integer, Integer> aIgnore)
 414            {
 415                // Using something more advanced like Boyer-Moore might be a
 416                // good idea...
 417
 418    98          final int[] iCheckSums = mLineBlockChecksums[aI];
 419    98          final int[] jCheckSums = mLineBlockChecksums[aJ];
 420    98          final long checkSum = iCheckSums[aILine];
 421
 422   220          for (int jLine : aJLines) {
 423
 424   122              if (aI == aJ && aILine >= jLine) {
 425   110                  continue;
 426                    }
 427
 428    12              if (jCheckSums[jLine] != checkSum) {
 429     0                  continue;
 430                    }
 431
 432    12              final Collection<Integer> ignoreEntries = aIgnore.get(aILine);
 433    12              if (ignoreEntries != null && ignoreEntries.contains(jLine)) {
 434     6                  continue;
 435                    }
 436
 437     6              final int duplicateLines =
 438                        verifiyDuplicateLines(aI, aJ, aILine, jLine);
 439     6              if (duplicateLines >= mMin) {
 440     6                  reportDuplicate(duplicateLines, aILine, mFiles.get(aJ), jLine);
 441     6                  final int extend = duplicateLines - mMin;
 442    12                  for (int i = 0; i < extend; i++) {
 443     6                      final int offset = (i + 1);
 444     6                      aIgnore.put(aILine + offset, jLine + offset);
 445                        }
 446                    }
 447                }
 448    98      }
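
The aIgnore bookkeeping above is what keeps a duplicate longer than mMin from being reported once per window: after a match of duplicateLines lines is reported, every later window inside it is registered as already handled. A sketch of just that bookkeeping, using the same Guava Multimap the file imports (the line numbers are made up):

    import com.google.common.collect.ArrayListMultimap;
    import com.google.common.collect.Multimap;

    public class IgnorePairsDemo
    {
        public static void main(String[] aArgs)
        {
            final int min = 12;
            final int duplicateLines = 15;  // a reported match three lines longer than min
            final int iLine = 40;           // block start in file i
            final int jLine = 100;          // matching block start in file j

            final Multimap<Integer, Integer> ignore = ArrayListMultimap.create();
            final int extend = duplicateLines - min;
            for (int i = 0; i < extend; i++) {
                final int offset = i + 1;
                // suppress the overlapping windows (41,101), (42,102) and (43,103)
                ignore.put(iLine + offset, jLine + offset);
            }
            System.out.println(ignore);     // {41=[101], 42=[102], 43=[103]}
        }
    }
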
 449
 450            /**
 451             * Verifies the number of lines that are equal.
 452             * Note that block checksums might be equal for blocks that in fact
 453             * are different, so we must check the actual file content again.
 454             *
 455             * @param aI file index i
 456             * @param aJ file index j
 457             * @param aIStartLine start line of potential duplicate code in file i
 458             * @param aJStartLine start line of potential duplicate code in file j
 459             * @return the number of verified equal lines
 460             */
 461            private int verifiyDuplicateLines(
 462                int aI, int aJ, int aIStartLine, int aJStartLine)
 463            {
 464     6          final File iFile = mFiles.get(aI);
 465     6          final File jFile = mFiles.get(aJ);
 466                try {
 467     6              final String[] iLines = getTrimmedLines(iFile);
 468     6              final String[] jLines = getTrimmedLines(jFile);
 469
 470     6              int verified = 0;
 471     6              int i = aIStartLine;
 472     6              int j = aJStartLine;
 473                    while (i < iLines.length && j < jLines.length
 474    39                  && iLines[i++].equals(jLines[j++]))
 475                    {
 476    33                  verified += 1;
 477                    }
 478     6              return verified;
 479                }
 480     0          catch (IOException ex) {
 481     0              LOG.error("Unable to verify potential duplicate for "
 482                        + iFile + " and " + jFile, ex);
 483     0              return 0;
 484                }
 485            }
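
The verification pass exists because block checksums can collide. String.hashCode() itself collides - "Aa" and "BB" are the classic example, both hashing to 2112 - so two blocks can share every checksum and still differ, which is why the actual file content must be compared:

    public class CollisionDemo
    {
        public static void main(String[] aArgs)
        {
            // equal hash codes, different content: a checksum match is only a hint
            System.out.println("Aa".hashCode());   // 2112
            System.out.println("BB".hashCode());   // 2112
            System.out.println("Aa".equals("BB")); // false
        }
    }
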
 486

 487
 488            /**
 489             * Returns the trimmed lines of a given file.
 490             * Caches the results, so when memory is available
 491             * we try to avoid reading the file repeatedly.
 492             *
 493             * @param aFile the file
 494             * @return the lines in aFile after applying {@link String#trim()}
 495             * @throws IOException if the file content cannot be read
 496             */
 497            private String[] getTrimmedLines(File aFile) throws IOException
 498            {
 499    18          final String path = aFile.getPath();
 500    18          final String[] cachedLines = mTrimmedLineCache.get(path);
 501    18          if (cachedLines != null) {
 502    12              return cachedLines;
 503                }
 504     6          final String charset = mCharset;
 505     6          final FileText text = new FileText(aFile, charset);
 506     6          final String[] lines = getTrimmed(text.toLinesArray());
 507     6          mTrimmedLineCache.put(path, lines);
 508     6          return lines;
 509            }
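
Because mTrimmedLineCache holds its values through soft references, cached line arrays survive only while memory allows; under pressure the garbage collector clears them and the file is simply re-read. A sketch of the same caching pattern (MapMaker.softValues() exists in the Guava version this code compiles against; later Guava versions moved this functionality to CacheBuilder):

    import com.google.common.collect.MapMaker;
    import java.util.Map;

    public class SoftCacheDemo
    {
        /** Values are held via SoftReference and may vanish under memory pressure. */
        private static final Map<String, String[]> CACHE =
            new MapMaker().softValues().makeMap();

        static String[] linesFor(String aPath)
        {
            final String[] cached = CACHE.get(aPath);
            if (cached != null) {
                return cached;           // cache hit, no re-read
            }
            final String[] lines = {"// pretend these were read from " + aPath};
            CACHE.put(aPath, lines);     // may be reclaimed again later
            return lines;
        }

        public static void main(String[] aArgs)
        {
            System.out.println(linesFor("Foo.java")[0]);
            System.out.println(linesFor("Foo.java")[0]); // served from the cache
        }
    }
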
 510
 511            /**
 512             * Applies {@link String#trim()} on each String in a given array.
 513             * @param aLines the original Strings
 514             * @return the converted Strings after applying {@link String#trim()}
 515             */
 516            private String[] getTrimmed(String[] aLines)
 517            {
 518     6          final String[] ret = new String[aLines.length];
 519   149          for (int i = 0; i < ret.length; i++) {
 520   143              ret[i] = aLines[i].trim();
 521                }
 522     6          return ret;
 523            }
 524
 525            /**
 526             * Dumps out a duplicate report.
 527             * @param aEquivalent number of equivalent lines
 528             * @param aILine location of duplicate code
 529             * within file that is currently checked
 530             * @param aJFile the other file that contains the duplicate
 531             * @param aJLine location of duplicate code within aJFile
 532             */
 533            private void reportDuplicate(
 534                    int aEquivalent, int aILine, File aJFile, int aJLine)
 535            {
 536     6          final String fileName =
 537                        Utils.getStrippedFileName(mBasedir, aJFile.getPath());
 538     6          log(aILine + 1, "duplicates.lines", aEquivalent, fileName, aJLine + 1);
 539     6          mDuplicates += 1;
 540     6      }
 541
 542        }