////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2014  Oliver Burn
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////
package com.puppycrawl.tools.checkstyle.checks.duplicates;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.MapMaker;
import com.google.common.collect.Multimap;
import com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck;
import com.puppycrawl.tools.checkstyle.api.FileText;
import com.puppycrawl.tools.checkstyle.api.MessageDispatcher;
import com.puppycrawl.tools.checkstyle.api.Utils;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Performs a line-by-line comparison of all code lines and reports
 * duplicate code if a sequence of lines differs only in
 * indentation. All import statements in Java code are ignored; any
 * other line - including javadoc, whitespace lines between methods,
 * etc. - is considered (which is why the check is called
 * <em>strict</em>).
 *
 * @author Lars Kühne
 */
public final class StrictDuplicateCodeCheck extends AbstractFileSetCheck
{
    /**
     * A prime that is used to calculate checksums of lines and blocks of
     * lines. It is important that it is larger than the length of most
     * lines to avoid hash collisions.
     *
     * For a list of primes see
     * http://www.utm.edu/research/primes/lists/small/1000.txt
     */
    private static final int BIG_PRIME = 317;

    /**
     * Converts each consecutive block of {@link #mMin} lines of the
     * original source lines to a checksum that is compared against
     * the checksums of other blocks to find duplicates.
     */
    private interface ChecksumGenerator
    {
        /**
         * Converts each block of the original source lines
         * to a checksum that is compared against other blocks
         * to find duplicates.
         *
         * @param aOriginalLines the original lines as they appear
         * in the source file
         *
         * @return an array of (aOriginalLines.length - mMin + 1) checksums
         */
        int[] convertLines(String[] aOriginalLines);
    }
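
    // Illustrative sketch of the block checksum construction (added
    // commentary, not part of the original algorithm notes): with mMin = 3
    // and per-line checksums c[i] as computed by calcChecksum(), each block
    // checksum produced by convertLines() is
    //
    //   block[i] = 1 * BIG_PRIME * c[i]
    //            + 2 * BIG_PRIME * c[i + 1]
    //            + 3 * BIG_PRIME * c[i + 2]
    //
    // The position weight (j + 1) makes the sum order-sensitive, so the
    // same lines in a different order usually produce a different block
    // checksum. Equal blocks always produce equal checksums, but the
    // converse can fail (hash collision), which is why candidates are
    // re-verified against the actual file content in verifyDuplicateLines()
    // below. Blocks that contain an IGNOREd line (or only empty lines) get
    // the IGNORE checksum themselves.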
    /**
     * Calculates checksums for text files.
     */
    private class TextfileChecksumGenerator implements ChecksumGenerator
    {
        /** {@inheritDoc} */
        public int[] convertLines(String[] aOriginalLines)
        {
            final int lineCount = aOriginalLines.length;
            final long[] checkSums = new long[lineCount];
            for (int i = 0; i < lineCount; i++) {
                final String line = aOriginalLines[i];
                checkSums[i] = calcChecksum(line);
            }
            final int retLen = Math.max(0, lineCount - mMin + 1);
            final int[] ret = new int[retLen];

            for (int i = 0; i < retLen; i++) {
                int blockChecksum = 0;
                boolean onlyEmptyLines = true;
                for (int j = 0; j < mMin; j++) {
                    if (aOriginalLines[i + j].length() > 0) {
                        onlyEmptyLines = false;
                    }
                    final long checksum = checkSums[i + j];
                    if (checksum == IGNORE) {
                        blockChecksum = IGNORE;
                        break;
                    }
                    blockChecksum += (j + 1) * BIG_PRIME * checksum;
                }
                ret[i] = onlyEmptyLines ? IGNORE : blockChecksum;
            }
            return ret;
        }

        /**
         * Computes a checksum for a single line of text.
         * @param aLine the line
         * @return checksum
         */
        protected int calcChecksum(String aLine)
        {
            final int hashCode = aLine.hashCode();
            if (hashCode == IGNORE) {
                // avoid a clash with the IGNORE sentinel value
                return Integer.MAX_VALUE / 2;
            }
            return hashCode;
        }
    }

    /**
     * A TextfileChecksumGenerator that also ignores imports.
     */
    private class JavaChecksumGenerator extends TextfileChecksumGenerator
    {
        // TODO: return IGNORE for lines in the header comment?
        // That would require some simple parsing...

        // we could also parse the java code using the TreeWalker
        // and then ignore everything before the CLASS_DEF...

        @Override
        protected int calcChecksum(String aLine)
        {
            // to avoid false alarms it is important that different lines
            // result in different checksums.
            if (aLine.startsWith("import ")) {
                return IGNORE;
            }
            return super.calcChecksum(aLine);
        }
    }

    /** a jakarta commons log */
    private static final Log LOG =
        LogFactory.getLog(StrictDuplicateCodeCheck.class);

    /** the checksum value to use for lines that should be ignored */
    static final int IGNORE = Integer.MIN_VALUE;

    /** default value for mMin */
    private static final int DEFAULT_MIN_DUPLICATE_LINES = 12;

    /** number of lines that have to be identical for reporting duplicates */
    private int mMin = DEFAULT_MIN_DUPLICATE_LINES;

    /** the basedir to strip off in filenames */
    private String mBasedir;

    /**
     * The checksums of all files that are currently checked.
     * Dimension one: file index
     * Dimension two: block start line
     */
    private int[][] mLineBlockChecksums;

    /**
     * helper to speed up the searching algorithm, holds the checksums from
     * {@link #mLineBlockChecksums} except {@link #IGNORE}, sorted.
     */
    private ChecksumInfo[] mChecksumInfo;

    /** files that are currently checked */
    private final List<File> mFiles = Lists.newArrayList();

    /**
     * A SoftReference cache for the trimmed lines of a file path.
     */
    private final Map<String, String[]> mTrimmedLineCache =
        new MapMaker().softValues().makeMap();

    // fields required only for statistics

    /** total number of duplicates found */
    private int mDuplicates;
    /** the charset used to load files. */
    private String mCharset;
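
    // A typical configuration for this check might look as follows. This
    // snippet is illustrative: the module name and its placement directly
    // under Checker (this is a FileSetCheck) follow the usual Checkstyle
    // conventions and are not stated anywhere in this file.
    //
    //   <module name="StrictDuplicateCode">
    //     <property name="min" value="20"/>
    //     <property name="basedir" value="src/main/java"/>
    //   </module>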
    /** Creates a new instance of this class. */
    public StrictDuplicateCodeCheck()
    {
    }

    /**
     * Sets the minimum number of lines that must be equivalent
     * before the check complains.
     *
     * @param aMin the number of lines that must be equal before
     * triggering a 'duplicate code' message.
     */
    public void setMin(int aMin)
    {
        if (aMin < 1) {
            throw new IllegalArgumentException("min must be 1 or higher");
        }
        mMin = aMin;
    }

    /** @param aBasedir the base directory to strip off in filenames */
    public void setBasedir(String aBasedir)
    {
        mBasedir = aBasedir;
    }

    @Override
    public void beginProcessing(String aCharset)
    {
        super.beginProcessing(aCharset);
        mCharset = aCharset;
        mFiles.clear();
    }

    @Override
    protected void processFiltered(File aFile, List<String> aLines)
    {
        mFiles.add(aFile);
    }

    @Override
    public void finishProcessing()
    {
        super.finishProcessing();
        final long start = System.currentTimeMillis();
        mDuplicates = 0;
        mLineBlockChecksums = new int[mFiles.size()][];
        mChecksumInfo = new ChecksumInfo[mFiles.size()];

        if (LOG.isDebugEnabled()) {
            LOG.debug("Reading " + mFiles.size() + " input files");
        }

        for (int i = 0; i < mFiles.size(); i++) {
            final File file = mFiles.get(i);
            try {
                final String[] lines = getTrimmedLines(file);
                final ChecksumGenerator transformer =
                    findChecksumGenerator(file);
                mLineBlockChecksums[i] = transformer.convertLines(lines);
            }
            catch (final IOException ex) {
                LOG.error("Cannot access " + file + " ("
                        + ex.getMessage() + "), ignoring", ex);
                // TODO: better to throw an exception here?
                // it would be best to throw IOException from process(),
                // but the interface definition doesn't allow that...
                mLineBlockChecksums = new int[0][0];
            }
        }
        fillSortedRelevantChecksums();

        final long endReading = System.currentTimeMillis();
        findDuplicates();
        final long endSearching = System.currentTimeMillis();

        dumpStats(start, endReading, endSearching);

        mLineBlockChecksums = null;
        mChecksumInfo = null;
    }

    /**
     * Finds the checksum generator for a given file.
     *
     * @param aFile the file to check for duplicates
     * @return a generator to use for aFile
     */
    private ChecksumGenerator findChecksumGenerator(File aFile)
    {
        if (aFile.getName().endsWith(".java")) {
            return new JavaChecksumGenerator();
        }
        // TODO: Not sure what to do with binary files such as gifs
        return new TextfileChecksumGenerator();
    }

    /**
     * Dumps out statistics data via the debug log.
     * @param aStart start time
     * @param aEndReading end time of reading the physical files
     * @param aEndSearching end time of the duplicate analysis
     */
    private void dumpStats(long aStart, long aEndReading, long aEndSearching)
    {
        if (LOG.isDebugEnabled()) {
            final long initTime = aEndReading - aStart;
            final long workTime = aEndSearching - aEndReading;
            LOG.debug("files = " + mFiles.size());
            LOG.debug("duplicates = " + mDuplicates);
            LOG.debug("Runtime = " + initTime + " + " + workTime);
        }
    }
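
    // Overview of finishProcessing() above (added commentary): the check
    // runs in three phases -
    //   1. reading:   each file is loaded, trimmed, and reduced to one
    //                 block checksum per window of mMin lines,
    //   2. indexing:  fillSortedRelevantChecksums() wraps those checksums
    //                 in sorted ChecksumInfo objects for fast lookup,
    //   3. searching: findDuplicates() compares all file pairs.
    // dumpStats() reports the time spent in phases 1+2 versus phase 3.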
    /**
     * Filters and sorts the relevant checksums and stores the result
     * in {@link #mChecksumInfo} during the setup phase.
     * That data is later used in a binary search to find out
     * if it is worth investigating a file for duplicates of a block.
     * If the block's checksum does not occur in the other file
     * at all, we can skip that file quickly.
     */
    private void fillSortedRelevantChecksums()
    {
        for (int i = 0; i < mLineBlockChecksums.length; i++) {
            final int[] checksums = mLineBlockChecksums[i];
            mChecksumInfo[i] = new ChecksumInfo(checksums);
        }
    }

    /**
     * Finds duplicate lines in mFiles,
     * using a text-search algorithm to find recurring
     * patterns in the line checksums.
     */
    private void findDuplicates()
    {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Analysis phase");
        }

        // It's been a while since my CS degree, but I think this is
        // somewhere near O(LOC^2).

        // It may be possible to do this *much* smarter,
        // but I don't have the Knuth bible at hand right now :-)

        final int len = mFiles.size();
        for (int i = 0; i < len; i++) {

            final String path = mFiles.get(i).getPath();
            getMessageCollector().reset();
            final MessageDispatcher dispatcher = getMessageDispatcher();
            dispatcher.fireFileStarted(path);

            for (int j = 0; j <= i; j++) {
                findDuplicatesInFiles(i, j);
            }

            fireErrors(path);
            dispatcher.fireFileFinished(path);
        }
    }

    /**
     * Compares two files and searches for duplicates.
     * @param aI mLineBlockChecksums index of the first file to compare
     * @param aJ mLineBlockChecksums index of the second file to compare
     */
    private void findDuplicatesInFiles(int aI, int aJ)
    {
        final ChecksumInfo iChecksumInfo = mChecksumInfo[aI];
        final ChecksumInfo jChecksumInfo = mChecksumInfo[aJ];
        if (!iChecksumInfo.hasChecksumOverlapsWith(jChecksumInfo)) {
            return;
        }

        final int[] iLineBlockChecksums = mLineBlockChecksums[aI];
        final int iBlockCount = iLineBlockChecksums.length;

        // blocks of duplicate code might be longer than 'min'. We need to
        // remember the line combinations where we must ignore identical blocks
        // because we have already reported them for an earlier blockIdx.
        final Multimap<Integer, Integer> ignorePairs =
            ArrayListMultimap.create();

        // go through all the blocks in iFile and
        // check if the following mMin lines occur in jFile
        for (int iLine = 0; iLine < iBlockCount; iLine++) {

            final int iSum = iLineBlockChecksums[iLine];
            final int[] jLines = jChecksumInfo.findLinesWithChecksum(iSum);
            // detailed analysis only if the iLine block occurs in jFile at all
            if (jLines.length > 0) {
                findDuplicateFromLine(aI, aJ, iLine, jLines, ignorePairs);
            }
        }
    }
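
    // Worked example for the ignorePairs bookkeeping above (added
    // commentary): with mMin = 12, suppose a 14-line duplicate starting at
    // iLine 10 / jLine 50 is found and reported. The blocks starting at
    // 11/51 and 12/52 would match as well and would otherwise be reported
    // as two further, largely overlapping duplicates. findDuplicateFromLine()
    // therefore records (11, 51) and (12, 52) in the multimap
    // (duplicateLines - mMin = 2 entries), so those line combinations are
    // skipped when their block index comes up.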
    /**
     * Finds and reports a duplicate of the code starting from line aILine
     * in file aI in the file aJ. The caller has already ensured that a
     * block of mMin lines has a matching checksum, so this method mainly
     * analyzes how far the block of duplicates extends.
     *
     * @param aI index of file that contains the candidate code
     * @param aJ index of file that is searched for a dup of the candidate
     * @param aILine starting line of the candidate in aI
     * @param aJLines lines in file aJ that have the same checksum as aILine
     * @param aIgnore Multimap from iLine to jLine; an entry indicates that
     * this i/j line combination has already been reported as part of another
     * violation
     */
    private void findDuplicateFromLine(
        final int aI, final int aJ, final int aILine,
        final int[] aJLines, final Multimap<Integer, Integer> aIgnore)
    {
        // Using something more advanced like Boyer-Moore might be a
        // good idea...

        final int[] iCheckSums = mLineBlockChecksums[aI];
        final int[] jCheckSums = mLineBlockChecksums[aJ];
        final long checkSum = iCheckSums[aILine];

        for (int jLine : aJLines) {

            if (aI == aJ && aILine >= jLine) {
                continue;
            }

            if (jCheckSums[jLine] != checkSum) {
                continue;
            }

            // a Multimap returns an empty collection (never null) for
            // unmapped keys, so no null check is required here
            final Collection<Integer> ignoreEntries = aIgnore.get(aILine);
            if (ignoreEntries.contains(jLine)) {
                continue;
            }

            final int duplicateLines =
                verifyDuplicateLines(aI, aJ, aILine, jLine);
            if (duplicateLines >= mMin) {
                reportDuplicate(duplicateLines, aILine, mFiles.get(aJ), jLine);
                final int extend = duplicateLines - mMin;
                for (int i = 0; i < extend; i++) {
                    final int offset = (i + 1);
                    aIgnore.put(aILine + offset, jLine + offset);
                }
            }
        }
    }

    /**
     * Verifies the number of lines that are equal.
     * Note that block checksums might be equal for blocks that in fact
     * are different, so we must check the actual file content again.
     *
     * @param aI file index i
     * @param aJ file index j
     * @param aIStartLine start line of potential duplicate code in file i
     * @param aJStartLine start line of potential duplicate code in file j
     * @return the number of verified equal lines
     */
    private int verifyDuplicateLines(
        int aI, int aJ, int aIStartLine, int aJStartLine)
    {
        final File iFile = mFiles.get(aI);
        final File jFile = mFiles.get(aJ);
        try {
            final String[] iLines = getTrimmedLines(iFile);
            final String[] jLines = getTrimmedLines(jFile);

            int verified = 0;
            int i = aIStartLine;
            int j = aJStartLine;
            while (i < iLines.length && j < jLines.length
                && iLines[i++].equals(jLines[j++]))
            {
                verified += 1;
            }
            return verified;
        }
        catch (IOException ex) {
            LOG.error("Unable to verify potential duplicate for "
                + iFile + " and " + jFile, ex);
            return 0;
        }
    }
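
    // Why the re-verification above is necessary (added commentary):
    // different strings can share a hash code, e.g.
    // "Aa".hashCode() == "BB".hashCode(), so two different blocks can end
    // up with the same block checksum. verifyDuplicateLines() therefore
    // compares the trimmed lines themselves, which also determines how far
    // beyond the mMin-line window the duplicate extends.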
    /**
     * Returns the trimmed lines of a given file.
     * Caches the results, so when memory is available
     * we try to avoid reading the file repeatedly.
     *
     * @param aFile the file
     * @return the lines in aFile after applying {@link String#trim()}
     * @throws IOException if the file content cannot be read
     */
    private String[] getTrimmedLines(File aFile) throws IOException
    {
        final String path = aFile.getPath();
        final String[] cachedLines = mTrimmedLineCache.get(path);
        if (cachedLines != null) {
            return cachedLines;
        }
        final String charset = mCharset;
        final FileText text = new FileText(aFile, charset);
        final String[] lines = getTrimmed(text.toLinesArray());
        mTrimmedLineCache.put(path, lines);
        return lines;
    }

    /**
     * Applies {@link String#trim()} on each String in a given array.
     * @param aLines the original Strings
     * @return the converted Strings after applying {@link String#trim()}
     */
    private String[] getTrimmed(String[] aLines)
    {
        final String[] ret = new String[aLines.length];
        for (int i = 0; i < ret.length; i++) {
            ret[i] = aLines[i].trim();
        }
        return ret;
    }

    /**
     * Dumps out a duplicate report.
     * @param aEquivalent number of equivalent lines
     * @param aILine location of duplicate code
     * within the file that is currently checked
     * @param aJFile the other file that contains the duplicate
     * @param aJLine location of duplicate code within aJFile
     */
    private void reportDuplicate(
        int aEquivalent, int aILine, File aJFile, int aJLine)
    {
        final String fileName =
            Utils.getStrippedFileName(mBasedir, aJFile.getPath());
        log(aILine + 1, "duplicates.lines", aEquivalent, fileName, aJLine + 1);
        mDuplicates += 1;
    }

}