Expressiveness Benchmark

Task: Documents with infrequent words

Specification: Find the ids of all documents that contain at least one word appearing exactly once in the whole corpus, where a word is a case-sensitive string of characters delimited by single spaces.

Input: documents

id	text
`1`	`Hello world`
`2`	`Hello friend`
`3`	`friend of the world`
`4`	`Hola`

Output: [3, 4]

Python - Imperative

def documents_with_infrequent_words(documents):
  """Return the ids of documents holding a word that occurs once in the corpus.

  Each document is a mapping with an "id" and a "text" entry. Text is split
  on single spaces and words are compared case-sensitively. Ids are returned
  in the order the documents are given.
  """
  tokens_by_id = {}
  occurrences = defaultdict(int)
  for document in documents:
    tokens = document["text"].split(" ")
    tokens_by_id[document["id"]] = tokens
    for token in tokens:
      occurrences[token] += 1

  # Words seen exactly once across every document.
  singletons = {token for token, seen in occurrences.items() if seen == 1}

  # Look the tokens up by id again so each document is tested against the
  # word list recorded for its id.
  return [
    document["id"]
    for document in documents
    if any(token in singletons for token in tokens_by_id[document["id"]])
  ]

Python - Functional

def documents_with_infrequent_words(documents):
  """Return the ids of documents holding a word that occurs once in the corpus.

  Each document is a mapping with an "id" and a "text" entry. Text is split
  on single spaces and words are compared case-sensitively. Ids are returned
  in the order the documents are given.
  """
  from collections import Counter

  words = [doc["text"].split(" ") for doc in documents]
  # Count every word in a single pass instead of rescanning the flattened
  # word list once per distinct word.
  freq = Counter(w for ws in words for w in ws)
  infrequent_words = {word for word, count in freq.items() if count == 1}
  return [
    doc["id"] for doc, ws in zip(documents, words)
    if not infrequent_words.isdisjoint(ws)
  ]

Python - Pandas

def documents_with_infrequent_words(documents):
  """Return the ids of documents holding a word that occurs once in the corpus.

  `documents` is a DataFrame with an `id` and a `text` column. Text is split
  on single spaces and words are compared case-sensitively. The result is a
  plain list of ids in row order.
  """
  # One row per document, one column per word position.
  words = documents.text.str.split(" ", expand=True)
  freq = words.stack().value_counts()
  infrequent_words = freq[freq == 1].index.values
  # Collapse the per-cell matches to one boolean per document so the frame is
  # filtered with an ordinary 1-D row mask rather than a 2-D boolean array.
  has_infrequent_word = np.isin(words.values, infrequent_words).any(axis=1)
  return documents[has_infrequent_word].id.unique().tolist()

R - Tidyverse

# Return the ids of the documents that contain at least one word occurring
# exactly once across all documents. `documents` is a data frame with `id`
# and `text` columns; text is split on single spaces.
documents_with_infrequent_words <- function(documents) {
  # One row per (document, word) pair. The column to unnest is named
  # explicitly because calling unnest() without it is deprecated.
  doc_words <- documents %>%
    mutate(word = str_split(text, " ")) %>%
    unnest(word)
  freq <- doc_words %>% count(word)
  unique_words <- freq %>% filter(n == 1)
  doc_words %>%
    filter(word %in% unique_words$word) %>%
    pull(id) %>%
    unique()
}

SQL - SQLite

-- NOTE: the FTS "simple" tokenizer folds ASCII letters to lower case,
-- so this solution is case-insensitive and NOT exactly like the others.

-- Full-text index over the existing documents table (external content),
-- populated explicitly with the 'rebuild' command.
CREATE VIRTUAL TABLE doc_index USING fts4(
  text, id, content=documents, tokenize=simple);
INSERT INTO doc_index(doc_index) VALUES('rebuild');
-- fts4aux exposes per-term statistics of the index.
CREATE VIRTUAL TABLE words USING fts4aux(doc_index);

SELECT DISTINCT id
FROM
  documents
  CROSS JOIN
  -- col = 0 restricts the statistics to the indexed "text" column (the
  -- left-most one). Without it, tokens of the indexed "id" column are
  -- counted too, so a repeated numeric word that also occurs once as an id
  -- would be treated as unique.
  (SELECT DISTINCT term
   FROM words
   WHERE col = 0 AND occurrences = 1) unique_words
WHERE
  -- The term must be delimited by spaces or by the ends of the text.
  (LOWER(text) LIKE '% ' || term || ' %') OR
  (LOWER(text) LIKE term || ' %') OR
  (LOWER(text) LIKE '% ' || term) OR
  (LOWER(text) LIKE term);

Datalog - Souffle

// substrs(Text, Idx, Len): for every document text, the (start index, length)
// pairs of its non-empty substrings that fit inside the text.
.decl substrs(Text:symbol, Idx:number, Len:number)
// Base case: the first character of any non-empty text.
substrs(Text, 0, 1) :- 
  documents(_, Text), strlen(Text) > 0.
// Grow the substring that starts at index 0 by one character while it fits.
substrs(Text, 0, Len+1) :- 
  substrs(Text, 0, Len), Len + 1 <= strlen(Text).
// Shift a substring one position to the right while it still fits.
substrs(Text, Idx+1, Len) :- 
  substrs(Text, Idx, Len), Idx + Len + 1 <= strlen(Text).

// token(Docid, Text, Idx, Word): Word is a space-delimited word of document
// Docid, starting at position Idx of its Text.
.decl token(Docid:number, Text:symbol, Idx:number, Word:symbol)
token(Docid, Text, Idx, Word) :-
  documents(Docid, Text),
  substrs(Text, Idx, Len),
  // Positions just before and just after the candidate substring.
  Prev = Idx - 1, Next = Idx + Len,
  // The candidate starts at the beginning of the text or right after a space...
  (Prev < 0; " " = substr(Text, Prev, 1)),
  // ...and stops at the end of the text or right before a space.
  (Next = strlen(Text); " " = substr(Text, Next, 1)),
  Word = substr(Text, Idx, Len),
  // Reject candidates spanning several words.
  // NOTE(review): assumes contains(needle, haystack) argument order - confirm.
  !contains(" ", Word).

// Id of every document that has a token whose Word matches exactly one
// token fact across all documents.
// NOTE(review): no .decl for this relation is visible in this section -
// presumably it is declared elsewhere, like documents.
documents_with_infrequent_words(Id) :-
  documents(Id, _),
  token(Id, _, _, Word),
  1 = count : token(_, _, _, Word).

Q - kdb+

/ Split the text of every document on single spaces: one word list per document.
words: (" " vs) each documents[`text];
/ Number of occurrences of each distinct word across all documents.
freq: count each group raze words;
/ The words that occur exactly once.
uniq: where[freq=1];
/ Ids of the documents for which any of their words is in uniq.
documents_with_infrequent_words:
  (select id from documents where '[any; in\: [;uniq]] each words) `id