001: /**
002: * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
003: */package net.sourceforge.pmd.cpd;
004:
005: import java.util.ArrayList;
006: import java.util.Collections;
007: import java.util.HashMap;
008: import java.util.HashSet;
009: import java.util.Iterator;
010: import java.util.List;
011: import java.util.Map;
012: import java.util.Set;
013:
014: public class MatchCollector {
015:
016: private MatchAlgorithm ma;
017: private Map<Match.MatchCode, Match> startMap = new HashMap<Match.MatchCode, Match>();
018: private Map<String, List<Match>> fileMap = new HashMap<String, List<Match>>();
019:
020: public MatchCollector(MatchAlgorithm ma) {
021: this .ma = ma;
022: }
023:
024: public void collect(List<TokenEntry> marks) {
025: //first get a pairwise collection of all maximal matches
026: for (int i = 0; i < marks.size() - 1; i++) {
027: TokenEntry mark1 = marks.get(i);
028: for (int j = i + 1; j < marks.size(); j++) {
029: TokenEntry mark2 = marks.get(j);
030: int diff = mark1.getIndex() - mark2.getIndex();
031: if (-diff < ma.getMinimumTileSize()) {
032: continue;
033: }
034: if (hasPreviousDupe(mark1, mark2)) {
035: continue;
036: }
037:
038: // "match too small" check
039: int dupes = countDuplicateTokens(mark1, mark2);
040: if (dupes < ma.getMinimumTileSize()) {
041: continue;
042: }
043: // is it still too close together
044: if (diff + dupes >= 1) {
045: continue;
046: }
047: determineMatch(mark1, mark2, dupes);
048: }
049: }
050: }
051:
052: public List<Match> getMatches() {
053: List<Match> matchList = new ArrayList<Match>(startMap.values());
054: Collections.sort(matchList);
055: Set<Match.MatchCode> matchSet = new HashSet<Match.MatchCode>();
056: Match.MatchCode matchCode = new Match.MatchCode();
057: for (int i = matchList.size(); i > 1; i--) {
058: Match match1 = matchList.get(i - 1);
059: TokenEntry mark1 = match1.getMarkSet().iterator().next();
060: matchSet.clear();
061: matchSet.add(match1.getMatchCode());
062: for (int j = i - 1; j > 0; j--) {
063: Match match2 = matchList.get(j - 1);
064: if (match1.getTokenCount() != match2.getTokenCount()) {
065: break;
066: }
067: TokenEntry mark2 = null;
068: for (Iterator<TokenEntry> iter = match2.getMarkSet()
069: .iterator(); iter.hasNext();) {
070: mark2 = iter.next();
071: if (mark2 != mark1) {
072: break;
073: }
074: }
075: int dupes = countDuplicateTokens(mark1, mark2);
076: if (dupes < match1.getTokenCount()) {
077: break;
078: }
079: matchSet.add(match2.getMatchCode());
080: match1.getMarkSet().addAll(match2.getMarkSet());
081: matchList.remove(i - 2);
082: i--;
083: }
084: if (matchSet.size() == 1) {
085: continue;
086: }
087: //prune the mark set
088: Set pruned = match1.getMarkSet();
089: boolean done = false;
090: ArrayList<TokenEntry> a1 = new ArrayList<TokenEntry>(match1
091: .getMarkSet());
092: Collections.sort(a1);
093: for (int outer = 0; outer < a1.size() - 1 && !done; outer++) {
094: TokenEntry cmark1 = a1.get(outer);
095: for (int inner = outer + 1; inner < a1.size() && !done; inner++) {
096: TokenEntry cmark2 = a1.get(inner);
097: matchCode.setFirst(cmark1.getIndex());
098: matchCode.setSecond(cmark2.getIndex());
099: if (!matchSet.contains(matchCode)) {
100: if (pruned.size() > 2) {
101: pruned.remove(cmark2);
102: }
103: if (pruned.size() == 2) {
104: done = true;
105: }
106: }
107: }
108: }
109: }
110: return matchList;
111: }
112:
113: /**
114: * A greedy algorithm for determining non-overlapping matches
115: */
116: private void determineMatch(TokenEntry mark1, TokenEntry mark2,
117: int dupes) {
118: Match match = new Match(dupes, mark1, mark2);
119: String fileKey = mark1.getTokenSrcID() + mark2.getTokenSrcID();
120: List<Match> pairMatches = fileMap.get(fileKey);
121: if (pairMatches == null) {
122: pairMatches = new ArrayList<Match>();
123: fileMap.put(fileKey, pairMatches);
124: }
125: boolean add = true;
126: for (int i = 0; i < pairMatches.size(); i++) {
127: Match other = pairMatches.get(i);
128: if (other.getFirstMark().getIndex() + other.getTokenCount()
129: - mark1.getIndex() > 0) {
130: boolean ordered = other.getSecondMark().getIndex()
131: - mark2.getIndex() < 0;
132: if ((ordered && (other.getEndIndex() - mark2.getIndex() > 0))
133: || (!ordered && (match.getEndIndex() - other
134: .getSecondMark().getIndex()) > 0)) {
135: if (other.getTokenCount() >= match.getTokenCount()) {
136: add = false;
137: break;
138: } else {
139: pairMatches.remove(i);
140: startMap.remove(other.getMatchCode());
141: }
142: }
143: }
144: }
145: if (add) {
146: pairMatches.add(match);
147: startMap.put(match.getMatchCode(), match);
148: }
149: }
150:
151: private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {
152: if (mark1.getIndex() == 0) {
153: return false;
154: }
155: return !matchEnded(ma.tokenAt(-1, mark1), ma.tokenAt(-1, mark2));
156: }
157:
158: private int countDuplicateTokens(TokenEntry mark1, TokenEntry mark2) {
159: int index = 0;
160: while (!matchEnded(ma.tokenAt(index, mark1), ma.tokenAt(index,
161: mark2))) {
162: index++;
163: }
164: return index;
165: }
166:
167: private boolean matchEnded(TokenEntry token1, TokenEntry token2) {
168: return token1.getIdentifier() != token2.getIdentifier()
169: || token1 == TokenEntry.EOF || token2 == TokenEntry.EOF;
170: }
171: }
|