Specification: Find the ids of the documents that contain at least one word that appears exactly once in the corpus, where a word is a case-sensitive string of characters separated by spaces.
Example input/output:
Input:
documents

| id | text |
|---|---|
| 1 | Hello world |
| 2 | Hello friend |
| 3 | friend of the world |
| 4 | Hola |
Output:
[3, 4]

Only "of", "the", and "Hola" occur exactly once in the corpus, so documents 3 and 4 are the only ones that contain an infrequent word.
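As a quick check of the example, the corpus-wide frequencies can be tallied directly with Python's collections.Counter. This sketch encodes the corpus as a list of dicts with "id" and "text" keys, which is an assumption; the original does not pin down a data representation.

from collections import Counter

documents = [
    {"id": 1, "text": "Hello world"},
    {"id": 2, "text": "Hello friend"},
    {"id": 3, "text": "friend of the world"},
    {"id": 4, "text": "Hola"},
]

freq = Counter(word for doc in documents for word in doc["text"].split(" "))
# word counts: Hello 2, world 2, friend 2, of 1, the 1, Hola 1
once = {word for word, count in freq.items() if count == 1}
# {'of', 'the', 'Hola'} -- only documents 3 and 4 contain any of these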
Python - Imperative

from collections import defaultdict

def documents_with_infrequent_words(documents):
    # Tokenize each document and tally word frequencies across the corpus.
    words = {}
    freq = defaultdict(int)
    for doc in documents:
        words[doc["id"]] = doc["text"].split(" ")
        for word in words[doc["id"]]:
            freq[word] += 1
    # Words that occur exactly once anywhere in the corpus.
    infrequent_words = set()
    for word, count in freq.items():
        if count == 1:
            infrequent_words.add(word)
    # Collect the ids of documents containing at least one such word.
    infrequent_docs = []
    for doc in documents:
        for word in words[doc["id"]]:
            if word in infrequent_words:
                infrequent_docs.append(doc["id"])
                break
    return infrequent_docs
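With documents bound to the example corpus from the check above, the call returns the expected ids; the functional version below takes the same list-of-dicts input.

print(documents_with_infrequent_words(documents))
# [3, 4]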
Python - Functional

def documents_with_infrequent_words(documents):
    words = [doc["text"].split(" ") for doc in documents]
    words_flat = [w for ws in words for w in ws]
    freq = {
        word: words_flat.count(word)
        for word in set(words_flat)
    }
    infrequent_words = set([
        word for word, count in freq.items()
        if count == 1
    ])
    infrequent_docs = [
        documents[i]["id"] for i, ws in enumerate(words)
        if len(set(ws) & infrequent_words) > 0
    ]
    return infrequent_docs
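One note on the frequency step: words_flat.count(word) rescans the flattened list for every distinct word, which is quadratic in the corpus size. A sketch of the same comprehension-style solution with that step swapped for collections.Counter (my variant, not part of the original):

from collections import Counter

def documents_with_infrequent_words(documents):
    words = [doc["text"].split(" ") for doc in documents]
    freq = Counter(w for ws in words for w in ws)
    infrequent_words = {word for word, count in freq.items() if count == 1}
    return [
        documents[i]["id"] for i, ws in enumerate(words)
        if set(ws) & infrequent_words
    ]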
Python - Pandas

import numpy as np

def documents_with_infrequent_words(documents):
    # One column per token position; short rows are padded with None.
    words = documents.text.str.split(" ", expand=True)
    freq = words.stack().value_counts()
    infrequent_words = freq[freq == 1].index.values
    # Rows whose tokens include any corpus-unique word; ids are
    # de-duplicated below because a row can match more than once.
    infrequent_docs = documents[
        np.isin(words.values, infrequent_words)]
    return infrequent_docs.id.unique().tolist()
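The pandas version expects documents to be a DataFrame with id and text columns rather than a list of dicts. A minimal usage sketch (the variable name documents_df is just illustrative):

import pandas as pd

documents_df = pd.DataFrame({
    "id": [1, 2, 3, 4],
    "text": ["Hello world", "Hello friend",
             "friend of the world", "Hola"],
})
print(documents_with_infrequent_words(documents_df))
# [3, 4]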
R - Tidyverse

library(tidyverse)

documents_with_infrequent_words <- function(documents) {
  # One row per (document, word) pair.
  split <- documents %>%
    mutate(word = str_split(text, " ")) %>%
    unnest(word)
  freq <- split %>% count(word)
  unique_words <- freq %>% filter(n == 1)
  split %>%
    filter(word %in% unique_words$word) %>%
    pull(id) %>%
    unique()
}
SQL - SQLite

-- NOTE: SQLite's "simple" tokenizer is case-insensitive by default,
-- so this solution is NOT exactly equivalent to the others.
CREATE VIRTUAL TABLE doc_index USING fts4(
    text, id, content=documents, tokenize=simple);
INSERT INTO doc_index(doc_index) VALUES('rebuild');
CREATE VIRTUAL TABLE words USING fts4aux(doc_index);

-- A document matches if its lowercased text contains a corpus-unique term
-- as a whole word: in the middle, at the start, at the end, or as the
-- entire text.
SELECT DISTINCT id
FROM
    documents
CROSS JOIN
    (SELECT DISTINCT term
     FROM words
     WHERE occurrences = 1) unique_words
WHERE
    (LOWER(text) LIKE '% ' || term || ' %') OR
    (LOWER(text) LIKE term || ' %') OR
    (LOWER(text) LIKE '% ' || term) OR
    (LOWER(text) LIKE term);
Datalog - Souffle

// Enumerate every substring of each document text by start index and length.
.decl substrs(Text:symbol, Idx:number, Len:number)
substrs(Text, 0, 1) :-
    documents(_, Text), strlen(Text) > 0.
substrs(Text, 0, Len+1) :-
    substrs(Text, 0, Len), Len + 1 <= strlen(Text).
substrs(Text, Idx+1, Len) :-
    substrs(Text, Idx, Len), Idx + Len + 1 <= strlen(Text).

// A substring is a token when it is bounded by spaces (or the ends of the
// text) and contains no space itself.
.decl token(Docid:number, Text:symbol, Idx:number, Word:symbol)
token(Docid, Text, Idx, Word) :-
    documents(Docid, Text),
    substrs(Text, Idx, Len),
    Prev = Idx - 1, Next = Idx + Len,
    (Prev < 0; " " = substr(Text, Prev, 1)),
    (Next = strlen(Text); " " = substr(Text, Next, 1)),
    Word = substr(Text, Idx, Len),
    !contains(" ", Word).

// A document qualifies if one of its tokens occurs exactly once overall.
documents_with_infrequent_words(Id) :-
    documents(Id, _),
    token(Id, _, _, Word),
    1 = count : { token(_, _, _, Word) }.
Q - kdb+

words: (" " vs) each documents[`text];  / split each text on spaces
freq: count each group raze words;      / corpus-wide word frequencies
uniq: where[freq=1];                    / words occurring exactly once
documents_with_infrequent_words:
    (select id from documents where '[any; in\:[;uniq]] each words) `id