-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2c85ce7
commit bd7cc9a
Showing
5 changed files
with
199 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
package main.java; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.File; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
public class InvertedIndex { | ||
List<String> stopwords = Arrays.asList("a", "able", "about", | ||
"across", "after", "all", "almost", "also", "am", "among", "an", | ||
"and", "any", "are", "as", "at", "be", "because", "been", "but", | ||
"by", "can", "cannot", "could", "dear", "did", "do", "does", | ||
"either", "else", "ever", "every", "for", "from", "get", "got", | ||
"had", "has", "have", "he", "her", "hers", "him", "his", "how", | ||
"however", "i", "if", "in", "into", "is", "it", "its", "just", | ||
"least", "let", "like", "likely", "may", "me", "might", "most", | ||
"must", "my", "neither", "no", "nor", "not", "of", "off", "often", | ||
"on", "only", "or", "other", "our", "own", "rather", "said", "say", | ||
"says", "she", "should", "since", "so", "some", "than", "that", | ||
"the", "their", "them", "then", "there", "these", "they", "this", | ||
"tis", "to", "too", "twas", "us", "wants", "was", "we", "were", | ||
"what", "when", "where", "which", "while", "who", "whom", "why", | ||
"will", "with", "would", "yet", "you", "your"); | ||
|
||
Map<String, List<Tuple>> indexedWords = new HashMap<>(); | ||
List<String> files = new ArrayList<>(); | ||
|
||
public void indexFile(File file) throws IOException { | ||
int fileNumber = files.indexOf(file.getPath()); | ||
if (fileNumber == -1) { | ||
files.add(file.getPath()); | ||
fileNumber = files.size() - 1; | ||
} | ||
|
||
BufferedReader reader = new BufferedReader(new FileReader(file)); | ||
for (String line = reader.readLine(); line != null; line = reader | ||
.readLine()) { | ||
for (String _word : line.split("\\W+")) { | ||
String word = _word.toLowerCase(); | ||
if (stopwords.contains(word)) | ||
continue; | ||
List<Tuple> idx = indexedWords.computeIfAbsent(word, k -> new LinkedList<>()); | ||
idx.add(new Tuple(fileNumber)); | ||
} | ||
} | ||
} | ||
|
||
public Set<String> search(ArrayList<String> wordsToFind) { | ||
Set<String> answer = new HashSet<>(); | ||
for (String words : wordsToFind) { | ||
|
||
String word = words.toLowerCase(); | ||
for (String key : indexedWords.keySet()) { | ||
Matcher matcher = Pattern.compile(word).matcher(key); | ||
if (matcher.find()) { | ||
List<Tuple> tupleList = indexedWords.get(key); | ||
if (tupleList != null) { | ||
for (Tuple t : tupleList) { | ||
answer.add(files.get(t.fileNumber)); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
return answer; | ||
} | ||
|
||
public Set<String> findCommonFiles(Set<String> answer, ArrayList<Set<String>> wordsToFindCommon) { | ||
Set<String> commonWords = findCommonWords(wordsToFindCommon); | ||
if (answer.size() > 0 && commonWords != null) { | ||
answer.retainAll(commonWords); | ||
return answer; | ||
} else if (answer.size() == 0 && commonWords != null) { | ||
return commonWords; | ||
} else if (answer.size() == 0) { | ||
return null; | ||
} else return answer; | ||
|
||
} | ||
|
||
public Set<String> deleteGivenFiles(Set<String> answer, Set<String> deleteFiles) { | ||
for (String s : deleteFiles) { | ||
answer.remove(s); | ||
} | ||
return answer; | ||
} | ||
|
||
|
||
public Set<String> findCommonWords(ArrayList<Set<String>> wordsToFindCommon) { | ||
if (wordsToFindCommon.size() > 0) { | ||
Set<String> commonWords = wordsToFindCommon.get(0); | ||
|
||
if (wordsToFindCommon.size() > 1) { | ||
for (int i = 1; i < wordsToFindCommon.size(); i++) { | ||
commonWords.retainAll(wordsToFindCommon.get(i)); | ||
} | ||
} | ||
return commonWords; | ||
} | ||
return null; | ||
} | ||
|
||
private static class Tuple { | ||
private final int fileNumber; | ||
|
||
public Tuple(int fileno) { | ||
this.fileNumber = fileno; | ||
} | ||
|
||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package main.java; | ||
|
||
import java.io.File; | ||
|
||
public class Main { | ||
public static void main(String[] args) { | ||
|
||
File directoryPath = new File("C:\\Users\\ASUS\\IdeaProjects\\codestar\\src\\main\\resources\\EnglishData"); | ||
File[] filesList = directoryPath.listFiles(); | ||
|
||
|
||
try { | ||
InvertedIndex idx = new InvertedIndex(); | ||
assert filesList != null; | ||
for (File file : filesList) { | ||
idx.indexFile(file); | ||
} | ||
new TakeInput(idx); | ||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package main.java; | ||
import java.util.*; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
public class TakeInput { | ||
|
||
public TakeInput(InvertedIndex idx) { | ||
System.out.println("1"); | ||
getOrder(idx); | ||
} | ||
|
||
public void getOrder(InvertedIndex idx) { | ||
Scanner scanner = new Scanner(System.in); | ||
while (true) { | ||
String input = scanner.nextLine(); | ||
String[] inputSplited = input.split("(\\s+)"); | ||
ArrayList<String> plusStrings = new ArrayList<>(); | ||
ArrayList<String> minusStrings = new ArrayList<>(); | ||
ArrayList<String> normalStrings = new ArrayList<>(); | ||
for (String string : inputSplited) addItemToOneOfThreeArrayLists(string, plusStrings, minusStrings, normalStrings); | ||
Set<String> answer = idx.search(plusStrings); | ||
Set<String> toDelete = idx.search(minusStrings); | ||
ArrayList<Set<String>> commons = new ArrayList<>(); | ||
for (String normalString : normalStrings) { | ||
ArrayList<String> arrayList = new ArrayList<>(); | ||
arrayList.add(normalString); | ||
commons.add(idx.search(arrayList)); | ||
} | ||
answer = idx.findCommonFiles(answer, commons); | ||
System.out.println(answer); | ||
answer = idx.deleteGivenFiles(answer, toDelete); | ||
for (String s : answer) System.out.println(s); | ||
} | ||
} | ||
|
||
private void addItemToOneOfThreeArrayLists(String string, ArrayList<String> plusStrings, ArrayList<String> minusStrings, ArrayList<String> normalStrings) { | ||
Pattern pattern = Pattern.compile("^\\+(.+)$"); | ||
Matcher matcher = pattern.matcher(string); | ||
Pattern pattern1 = Pattern.compile("^-(.+)$"); | ||
Matcher matcher1 = pattern1.matcher(string); | ||
if (matcher.find()) { | ||
String a = matcher.group(1); | ||
plusStrings.add(a); | ||
} else if (matcher1.find()) { | ||
String a = matcher1.group(1); | ||
minusStrings.add(a); | ||
} else normalStrings.add(string); | ||
} | ||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
mother I have a 42 yr old male friend, misdiagnosed as havin osteopporosis for two years, who recently found out that hi illness is the rare Gaucher's disease.Gaucher's disease symptoms include: brittle bones (he lost 9 inches off his hieght); enlarged liver and spleen; interna bleeding; and fatigue (all the time). The problem (in Type 1) i attributed to a genetic mutation where there is a lack of th enzyme glucocerebroside in macrophages so the cells swell up This will eventually cause deathEnyzme replacement therapy has been successfully developed an approved by the FDA in the last few years so that those patient administered with this drug (called Ceredase) report a remarkabl improvement in their condition. Ceredase, which is manufacture by biotech biggy company--Genzyme--costs the patient $380,00 per year. Gaucher\'s disease has justifyably been called "the mos expensive disease in the world"NEED INFOI have researched Gaucher's disease at the library but am relyin on netlanders to provide me with any additional information**news, stories, report**people you know with this diseas**ideas, articles about Genzyme Corp, how to get a hold o enough money to buy some, programs available to help wit costs**Basically ANY HELP YOU CAN OFFEThanks so very muchDeborah |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
>This wouldn't happen to be the same thing as chiggers, would it>A truly awful parasitic affliction, as I understand it. Tiny bug>dig deeply into the skin, burying themselves. Yuck! They have thes>things in OklahomaClose. My mother comes from Gainesville Tex, right across the borderThey claim to be the chigger capitol of the world, and I believe themWhen I grew up in Fort Worth it was bad enough, but in Gainesvillin the summer an attack was guaranteedDoug McDonal |