Specification: Find the ids of the documents that contain at least one word appearing exactly once in the whole corpus, where the words of a document are the case-sensitive strings obtained by splitting its text on single spaces.
Example input/output:
id | text |
1 | Hello world |
2 | Hello friend |
3 | friend of the world |
4 | Hola |
[3, 4]
Python - Imperative
def documents_with_infrequent_words(documents):
    """Return the ids of documents holding a word that occurs once corpus-wide.

    Args:
        documents: sequence of mappings, each with an "id" entry and a "text"
            entry; the text is tokenised by splitting on single spaces.

    Returns:
        List of "id" values, in input order, of the documents that contain at
        least one token whose total count over all documents is exactly 1.
    """
    tokens_by_id = {}
    occurrences = defaultdict(int)
    for document in documents:
        tokens = document["text"].split(" ")
        tokens_by_id[document["id"]] = tokens
        for token in tokens:
            occurrences[token] += 1

    # Tokens seen exactly once across the whole corpus.
    singletons = {token for token, seen in occurrences.items() if seen == 1}

    return [
        document["id"]
        for document in documents
        if any(token in singletons for token in tokens_by_id[document["id"]])
    ]
Python - Functional
def documents_with_infrequent_words(documents):
    """Return the ids of documents holding a word that occurs once corpus-wide.

    Args:
        documents: list of mappings, each with an "id" entry and a "text"
            entry; the text is tokenised by splitting on single spaces.

    Returns:
        List of "id" values, in input order, of the documents that contain at
        least one word whose total count over all documents is exactly 1.
    """
    # Function-scope import keeps this snippet self-contained.
    from collections import Counter

    words = [doc["text"].split(" ") for doc in documents]
    # Single pass over the corpus; calling list.count() once per distinct word
    # would rescan every word each time (quadratic in corpus size).
    freq = Counter(w for ws in words for w in ws)
    infrequent_words = {word for word, count in freq.items() if count == 1}
    return [
        doc["id"]
        for doc, ws in zip(documents, words)
        if not infrequent_words.isdisjoint(ws)
    ]
Python - Pandas
def documents_with_infrequent_words(documents):
    """Return the ids of documents holding a word that occurs once corpus-wide.

    Args:
        documents: DataFrame with an ``id`` column and a string ``text``
            column; the text is tokenised by splitting on single spaces.

    Returns:
        List of distinct ``id`` values, in row order, of the rows whose text
        contains at least one word occurring exactly once over all rows.
    """
    # One column per word position; shorter documents are padded with missing
    # values, which stack()/value_counts() leave out of the counts.
    words = documents.text.str.split(" ", expand=True)
    freq = words.stack().value_counts()
    infrequent_words = freq[freq == 1].index.values
    # Collapse the per-cell matches to one boolean per row so the DataFrame is
    # filtered with an ordinary 1-D row mask instead of a 2-D array.
    has_infrequent_word = np.isin(words.values, infrequent_words).any(axis=1)
    infrequent_docs = documents[has_infrequent_word]
    return infrequent_docs.id.unique().tolist()
R - Tidyverse
# Return the ids of the documents that contain at least one word occurring
# exactly once across all documents.
#
# documents: data frame with an `id` column and a character `text` column;
#            each text is split into words on single spaces.
# Returns:   vector of distinct `id` values, in row order.
documents_with_infrequent_words <- function(documents) {
  # One row per (document, word). The list-column is named explicitly:
  # a bare unnest() is deprecated since tidyr 1.0.0 and would also expand
  # any other list-column the caller's data frame happens to carry.
  split <- documents %>%
    mutate(word = str_split(text, " ")) %>%
    unnest(word)

  # Corpus-wide count per distinct word, then keep the words seen once.
  freq <- split %>% count(word)
  unique_words <- freq %>% filter(n == 1)

  split %>%
    filter(word %in% unique_words$word) %>%
    pull(id) %>%
    unique()
}
SQL - SQLite
-- Ids of the documents that contain a word occurring exactly once in the
-- corpus, computed through an FTS4 index over the `documents` table.
--
-- NOTE: SQLite tokenize is case-insensitive by default,
-- so this solution is NOT exactly like the others
CREATE VIRTUAL TABLE doc_index USING fts4(
  text, id, content=documents, tokenize=simple);
-- External-content index: populate it from `documents`.
INSERT INTO doc_index(doc_index) VALUES('rebuild');
-- fts4aux exposes per-term statistics of the index: one '*' row (all columns)
-- plus one row per indexed column in which the term appears.
CREATE VIRTUAL TABLE words USING fts4aux(doc_index);

SELECT DISTINCT id
FROM documents
CROSS JOIN (
  SELECT DISTINCT term
  FROM words
  -- col = 0 is the `text` column. Without this filter the tokenised `id`
  -- values (col = 1) count as terms too, so a word that is frequent in
  -- `text` but equal to some id would be reported as occurring once.
  WHERE col = 0 AND occurrences = 1
) unique_words
-- Match the term as a whole space-delimited word: middle, start, end, or
-- the entire text.
WHERE (LOWER(text) LIKE '% ' || term || ' %')
   OR (LOWER(text) LIKE term || ' %')
   OR (LOWER(text) LIKE '% ' || term)
   OR (LOWER(text) LIKE term)
Datalog - Souffle
// substrs(Text, Idx, Len): every in-bounds span (start offset Idx, length
// Len >= 1) of each document text.
.decl substrs(Text:symbol, Idx:number, Len:number)

// Seed: the one-character span at offset 0 of every non-empty text.
substrs(Txt, 0, 1) :-
    documents(_, Txt),
    strlen(Txt) > 0.

// Grow a span anchored at offset 0 by one character while it still fits.
substrs(Txt, 0, Size + 1) :-
    substrs(Txt, 0, Size),
    Size + 1 <= strlen(Txt).

// Shift a span one character to the right while it still fits.
substrs(Txt, Start + 1, Size) :-
    substrs(Txt, Start, Size),
    Start + Size + 1 <= strlen(Txt).

// token(Docid, Text, Idx, Word): Word is a maximal space-free span of Text
// starting at Idx, i.e. it is bounded on each side by a space or by the
// start/end of the text.
.decl token(Docid:number, Text:symbol, Idx:number, Word:symbol)
token(Doc, Txt, Start, Tok) :-
    documents(Doc, Txt),
    substrs(Txt, Start, Size),
    Left = Start - 1,
    Right = Start + Size,
    (Left < 0; " " = substr(Txt, Left, 1)),
    (Right = strlen(Txt); " " = substr(Txt, Right, 1)),
    Tok = substr(Txt, Start, Size),
    !contains(" ", Tok).

// A document qualifies when one of its tokens has exactly one occurrence
// among all tokens of all documents.
documents_with_infrequent_words(Doc) :-
    documents(Doc, _),
    token(Doc, _, _, Tok),
    1 = count : token(_, _, _, Tok).
Q - kdb+
/ Ids of the documents that contain at least one word occurring exactly once across all documents.
/ words: for each row of documents, its text split on single spaces into a list of words.
/ freq:  number of occurrences of each distinct word over the flattened (raze) word lists.
/ uniq:  the words whose occurrence count is 1.
/ documents_with_infrequent_words: the id column of the rows for which any word of the row is in uniq.
/ NOTE(review): reads the table documents (columns id and text) from outside this snippet and
/ assigns words, freq and uniq alongside the result - presumably globals at script level; confirm.
words: (" " vs) each documents[`text]; freq: count each group raze words; uniq: where[freq=1]; documents_with_infrequent_words: (select id from documents where '[any; in\: [;uniq]] each words) `id