337 lines
9.0 KiB
Java
337 lines
9.0 KiB
Java
package strategies;
|
|
|
|
import java.io.File;
|
|
import java.util.ArrayList;
|
|
import java.util.Collections;
|
|
import java.util.Comparator;
|
|
import java.util.HashMap;
|
|
import java.util.HashSet;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import java.util.function.Predicate;
|
|
|
|
import org.snt.inmemantlr.tree.Ast;
|
|
import org.snt.inmemantlr.tree.AstNode;
|
|
|
|
import cppparser.Grammar;
|
|
|
|
public class BasicAlgorithm {
|
|
private Map<Ast, String> treeMap=new HashMap<>();
|
|
private Map<Integer,List<Ast>> bucketMap=new HashMap<>();
|
|
private Map<String, Map<AstNode,Integer>> initialMap=new HashMap<>();
|
|
private Map<String, Double> matchPart=new HashMap<>();
|
|
/**
|
|
* getBucket(int nodeCount) calculates the bucket number for the astTree
|
|
* @param nodeCount number of ast nodes in a subtree
|
|
* @return
|
|
*/
|
|
public int getBucket(int nodeCount)
|
|
{
|
|
int bucket=0;
|
|
|
|
if(nodeCount<10)
|
|
{
|
|
bucket=nodeCount;
|
|
}
|
|
else
|
|
{
|
|
double bucket1=nodeCount*0.1;
|
|
bucket=(int)bucket1;
|
|
}
|
|
return bucket;
|
|
}
|
|
/**
|
|
*
|
|
* @param bucketNumber it represents the bucket in which subtree is present
|
|
* @param subAst it gives the list of subtrees
|
|
* @param list it represents a list of Asts
|
|
*/
|
|
public Map<Integer,List<Ast>> populateBucketMap(int bucketNumber, Set<Ast> subAst, List<Ast> list, String fileName)
|
|
{
|
|
for(Ast a :subAst)
|
|
{
|
|
int hash=a.hashCode()%bucketNumber;
|
|
if(bucketMap.containsKey(hash))
|
|
{
|
|
list=bucketMap.get(hash);
|
|
list.add(a);
|
|
bucketMap.put(hash, list);
|
|
}
|
|
else
|
|
{
|
|
list=new ArrayList<>();
|
|
list.add(a);
|
|
bucketMap.put(hash, list);
|
|
}
|
|
treeMap.put(a, fileName);
|
|
}
|
|
return bucketMap;
|
|
}
|
|
/**
|
|
*
|
|
* @param ok it is a list of C files to be given as input
|
|
* @param gfile it is a syntax parser file for C+
|
|
*/
|
|
public Map<String, Map<AstNode,Integer>> populateInitialMatch(Map<String, Ast> map)
|
|
{
|
|
List<Ast> list=new ArrayList<>();
|
|
Map<AstNode,Integer> m;
|
|
/**
|
|
* Using hashing to classify subtrees to buckets
|
|
*/
|
|
for(Map.Entry<String, Ast> entry: map.entrySet())
|
|
{
|
|
m=new HashMap<>();
|
|
Predicate<AstNode> pred=p->p.getChildren()!=null;
|
|
int n=entry.getValue().getNodes().size();
|
|
int bucketNumber=getBucket(n);
|
|
Set<Ast> subAst=entry.getValue().getSubtrees(pred);
|
|
for(AstNode n1 : entry.getValue().getNodes())
|
|
{
|
|
m.put(n1, 0);
|
|
}
|
|
initialMap.put(entry.getKey(),m);
|
|
bucketMap=populateBucketMap(bucketNumber,subAst, list, entry.getKey());
|
|
}
|
|
return initialMap;
|
|
}
|
|
|
|
|
|
/**
|
|
* This method populates result for all matching Ast subtrees
|
|
* @param firstAst
|
|
* @param secondAst
|
|
* @param firstFile
|
|
* @param secondFile
|
|
* @param result
|
|
*/
|
|
public void populateResult(Ast firstAst, Ast secondAst, List<Ast> firstFile, List<Ast> secondFile, Map<String, List<Ast>> result)
|
|
{
|
|
if(result.containsKey(treeMap.get(firstAst)))
|
|
{
|
|
firstFile=result.get(treeMap.get(firstAst));
|
|
firstFile.add(firstAst);
|
|
result.put(treeMap.get(firstAst),firstFile);
|
|
}
|
|
else
|
|
{
|
|
firstFile=new ArrayList<>();
|
|
firstFile.add(firstAst);
|
|
result.put(treeMap.get(firstAst),firstFile);
|
|
}
|
|
|
|
if(result.containsKey(treeMap.get(secondAst)))
|
|
{
|
|
secondFile=result.get(treeMap.get(secondAst));
|
|
secondFile.add(secondAst);
|
|
result.put(treeMap.get(secondAst),secondFile);
|
|
}
|
|
else
|
|
{
|
|
secondFile=new ArrayList<>();
|
|
secondFile.add(secondAst);
|
|
result.put(treeMap.get(secondAst),secondFile);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* This algorithm calculates the subtree match
|
|
* @param map it has file names and corresponding AST
|
|
* @param initialMap it contains fileNames with corresponding AST's and their match count
|
|
* @param bucketMap it contains bucket number and AST lists
|
|
* @param treeMap it contains tree/subtrees with corresponding fileName
|
|
*/
|
|
public Map<List<String>,List<Double>> basicPlagiarism(File[] files, File cppConfigFile)
|
|
{
|
|
Map<String, Ast> map=Grammar.runAndGetAsts(files, cppConfigFile);
|
|
initialMap=populateInitialMatch(map);
|
|
List<String> pairs=new ArrayList<>();
|
|
Map<String, List<Ast>> result=new HashMap<>();
|
|
Map<String, Integer> counter=new HashMap<>();
|
|
Map<List<String>, Map<String, List<Ast>>> finalResult=new HashMap<>();
|
|
List<Ast> firstFile=new ArrayList<>();
|
|
List<Ast> secondFile=new ArrayList<>();
|
|
Map<List<String>,List<Double>> percentResult=new HashMap<>();
|
|
/**
|
|
* Initialize counter
|
|
*/
|
|
for(Map.Entry<String, Ast> k:map.entrySet())
|
|
{
|
|
counter.put(k.getKey(), 0);
|
|
}
|
|
for(Map.Entry<Integer, List<Ast>> entry: bucketMap.entrySet())
|
|
{
|
|
if(entry.getValue().size()>=2)
|
|
{
|
|
List<Ast> newList=new ArrayList<>(entry.getValue());
|
|
for(Ast firstAst: entry.getValue())
|
|
{
|
|
newList.remove(firstAst);
|
|
for(Ast secondAst:newList)
|
|
{
|
|
int num1=firstAst.getNodes().size();
|
|
int count1=0;
|
|
if(secondAst.getNodes().size()==num1 && treeMap.containsKey(secondAst) && treeMap.containsKey(firstAst)
|
|
&& (treeMap.get(secondAst)!=treeMap.get(firstAst)))
|
|
{
|
|
|
|
count1=getSimilarSubtrees(firstAst,secondAst, count1,counter,pairs);
|
|
}
|
|
|
|
if(num1==count1)
|
|
{
|
|
populateResult(firstAst,secondAst,firstFile, secondFile,result);
|
|
finalResult.put(pairs,result);
|
|
calculateResult(finalResult,map,percentResult);
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return percentResult;
|
|
}
|
|
/**
|
|
*
|
|
* @param firstAst
|
|
* @param secondAst
|
|
* @param count1
|
|
* @param counter
|
|
* @param pairs
|
|
* @return
|
|
*/
|
|
public int getSimilarSubtrees(Ast firstAst, Ast secondAst, int count1,Map<String, Integer> counter,List<String> pairs)
|
|
{
|
|
for(AstNode node: firstAst.getNodes())
|
|
{
|
|
for(AstNode s1:secondAst.getNodes())
|
|
{
|
|
if( s1.toString().replaceAll("^[0-9]+", "").equals( node.toString().replaceAll("^[0-9]+", "")))
|
|
{
|
|
Map<AstNode, Integer> n=initialMap.get(treeMap.get(secondAst));
|
|
count1++;
|
|
if(n.get(s1)!=1)
|
|
{
|
|
n.put(s1, 1);
|
|
initialMap.put(treeMap.get(secondAst), n);
|
|
}
|
|
if(counter.containsKey(treeMap.get(secondAst)))
|
|
counter.put(treeMap.get(secondAst), counter.get(treeMap.get(secondAst))+1);
|
|
Map<AstNode, Integer> w=initialMap.get(treeMap.get(firstAst));
|
|
if(w.get(node)!=1)
|
|
{
|
|
w.put(node, 1);
|
|
initialMap.put(treeMap.get(firstAst), w);
|
|
}
|
|
if(counter.containsKey(treeMap.get(firstAst)))
|
|
counter.put(treeMap.get(firstAst), counter.get(treeMap.get(firstAst))1);
|
|
pairs=new ArrayList<>();
|
|
pairs.add(treeMap.get(secondAst));
|
|
pairs.add(treeMap.get(firstAst));
|
|
|
|
}
|
|
}
|
|
}
|
|
return count1;
|
|
}
|
|
/**
|
|
* This method calculates code for sequence plagiarism detection
|
|
* @param finalResult
|
|
* @param stringMap
|
|
*/
|
|
public void SequenceDetection(Map<List<String>, Map<String, List<Ast>>> finalResult, Map<String, Ast> stringMap)
|
|
{
|
|
for(Map.Entry<List<String>, Map<String, List<Ast>>> map: finalResult.entrySet())
|
|
{
|
|
Map<String, List<Ast>> res=map.getValue();
|
|
for(Map.Entry<String, List<Ast>> result: res.entrySet())
|
|
{
|
|
List<Ast> subtreeList=result.getValue();
|
|
List<Ast> list=new ArrayList<>();
|
|
|
|
Comparator<Ast> astLengthComparator = new Comparator<Ast>()
|
|
{
|
|
@Override
|
|
public int compare(Ast o1, Ast o2)
|
|
{
|
|
return Integer.compare(o2.getNodes().size(), o1.getNodes().size());
|
|
}
|
|
};
|
|
|
|
Collections.sort(subtreeList, astLengthComparator);
|
|
list.add(subtreeList.get(0));
|
|
|
|
for(Ast firstAst: subtreeList)
|
|
{
|
|
List<AstNode> parent=firstAst.getNodes();
|
|
int count=0;
|
|
for(Ast a :list)
|
|
{
|
|
List<AstNode> l=a.getNodes();
|
|
for(AstNode p:parent)
|
|
{
|
|
for(AstNode s: l)
|
|
{
|
|
if(s.toString().equals(p.toString()))
|
|
{
|
|
count++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if(count>=0 && count!=parent.size())
|
|
{
|
|
list.add(firstAst);
|
|
}
|
|
|
|
}
|
|
|
|
int nodeCount=stringMap.get(result.getKey()).getNodes().size();
|
|
int count=0;
|
|
for(Ast a :list)
|
|
{
|
|
count=count+a.getNodes().size();
|
|
}
|
|
double percent=((double)count/nodeCount)*100;
|
|
matchPart.put(result.getKey(), percent);
|
|
}
|
|
}
|
|
|
|
}
|
|
/**
|
|
* This method calculates result for percentage of plagiarism
|
|
* @param finalResult
|
|
* @param stringMap
|
|
*/
|
|
public void calculateResult(Map<List<String>, Map<String, List<Ast>>> finalResult, Map<String, Ast> stringMap,Map<List<String>,List<Double>> percentResult)
|
|
{
|
|
for(Map.Entry<List<String>, Map<String, List<Ast>>> map: finalResult.entrySet())
|
|
{
|
|
Map<String, List<Ast>> res=map.getValue();
|
|
List<Double> percentage=new ArrayList<>();
|
|
List<String> files=new ArrayList<>();
|
|
for(Map.Entry<String, List<Ast>> result: res.entrySet())
|
|
{
|
|
List<Ast> subtreeList=result.getValue();
|
|
Set<AstNode> setNode=new HashSet<>();
|
|
for(Ast tree:subtreeList)
|
|
{
|
|
for(AstNode astNode: tree.getNodes())
|
|
{
|
|
setNode.add(astNode);
|
|
}
|
|
}
|
|
int nodeCount=stringMap.get(result.getKey()).getNodes().size();
|
|
double percent=((double)setNode.size()/nodeCount)*100;
|
|
matchPart.put(result.getKey(), percent);
|
|
percentage.add(percent);
|
|
files.add(result.getKey());
|
|
}
|
|
for(String s: files)
|
|
System.out.println(s);
|
|
percentResult.put(files,percentage);
|
|
|
|
}
|
|
}
|
|
} |