This benchmark is a work-in-progress dataset of programs implementing tabular data analytics tasks, similar to Rosetta Code or 7GUIs. It has two goals:
Category | Task name | Python - Imperative | Python - Functional | Python - Pandas | R - Tidyverse | SQL - SQLite | Datalog - Souffle | Q - kdb+ |
---|---|---|---|---|---|---|---|---|
Aggregation | Continent with the highest average population | def continent_by_population(countries): continent_stats = defaultdict(lambda: [0, 0]) for country in countries: continent = country['continent'] continent_stats[continent][0] += country['population'] continent_stats[continent][1] += 1 max_continent = None max_average = None for continent, [total, count] in continent_stats.items(): average = total / count if max_average is None or max_average < average: max_average = average max_continent = continent return max_continent | def continent_by_population(countries): continents = set([c['continent'] for c in countries]) populations_by_continent = [ (continent, [c['population'] for c in countries if c['continent'] == continent]) for continent in continents ] averages = [ (continent, sum(pops) / len(pops)) for continent, pops in populations_by_continent ] return max(averages, key=lambda t: t[1])[0] | def continent_by_population(countries): mean_pop = countries.groupby('continent').population.mean() return mean_pop.index[mean_pop.argmax()] | continent_by_population <- function(countries) { countries %>% group_by(continent) %>% summarize(mean_pop = mean(population)) %>% slice(which.max(mean_pop)) %>% .$continent } | SELECT continent FROM countries GROUP BY continent ORDER BY AVG(population) DESC LIMIT 1 | .decl average_population(Continent:symbol, Avg:number) average_population(Continent, Avg) :- countries(Continent, _, _), Total = sum P : countries(Continent, _, P), Num_countries = count : countries(Continent, _, _), Avg = Total / Num_countries. continent_by_population(Continent) :- countries(Continent, _, _), average_population(Continent, Max_avg), Max_avg = max A : { countries(C, _, _), average_population(C, A) }. | averages: select avg(population) by continent from countries; continent_by_population: (first select[>population] continent from averages) `continent |
| | First element that doesn't change mean | def get_mean(ls): sum = 0 for e in ls: sum += e['value'] return sum/len(ls) def changing_mean(vals): start_mean = get_mean(vals) for i,elem in enumerate(vals): new_ls = vals[:i] + vals[i+1:] mean = get_mean(new_ls) if abs(mean - start_mean) < 0.1: return i return None | def changing_mean(vals): start_mean = sum([l['value'] for l in vals])/len(vals) diff = lambda m, sm: abs(m - sm) for i,elem in enumerate(vals): new_ls = [x['value'] for x in vals if x != elem] mean = sum(new_ls)/len(new_ls) if diff(mean,start_mean) < 0.1: return i return None | def changing_mean(vals): mean = vals.value.mean() mean_without = vals.apply(lambda row: vals[vals.id != row.id].value.mean(), axis=1) return vals[(mean_without - mean).abs() < 0.1].id.tolist() | library(data.table) changing_mean <- function(vals) { global_mean <- mean(vals$value) setDT(vals)[, mean_change := abs(global_mean - mean(vals[id != .BY, value])), by = id] %>% filter(mean_change < 0.1) %>% slice(which.min(id)) %>% pull(id) } | SELECT id FROM vals v1 WHERE ABS(( SELECT AVG(value) FROM vals v2 WHERE v1.id != v2.id ) - (SELECT AVG(value) FROM vals)) < 0.1 | .decl avg_except(I:number, Avg:float) avg_except(I, Avg) :- vals(I, _), N = sum 1.0 : { vals(J, _), I != J }, Total = sum V : { vals(J, V), I != J }, Avg = Total / N. changing_mean(I) :- vals(I, _), N = sum 1.0 : vals(_, _), Total = sum V : vals(_, V), Avg = Total / N, avg_except(I, AvgExcept), ((Avg > AvgExcept, Avg - AvgExcept < 0.1); (Avg < AvgExcept, AvgExcept - Avg < 0.1)). | diffs: {abs avg vals[`value] - avg (vals[`value] where vals[`id] <> x)} each vals[`id]; changing_mean: first vals[`id] where diffs < 0.1 |
| | Median population for each continent | def continent_median_population(countries): populations = defaultdict(list) for country in countries: populations[country['continent']].append(country['population']) output = [] for continent, pops in populations.items(): pops.sort() N = len(pops) if N % 2 == 1: median = pops[(N - 1) // 2] else: median = (pops[N // 2 - 1] + pops[N // 2]) / 2 output.append({ "continent": continent, "population": median }) return output | def continent_median_population(countries): continents = set([c['continent'] for c in countries]) populations = { continent: [ c['population'] for c in countries if c['continent'] == continent ] for continent in continents } def compute_median(pops): pops = sorted(pops) N = len(pops) if N % 2 == 1: return pops[(N - 1) // 2] else: return (pops[N // 2 - 1] + pops[N // 2]) / 2 return [ {"continent": continent, "population": compute_median(pops)} for continent, pops in populations.items() ] | def continent_median_population(countries): return (countries .groupby('continent') .population.median() .reset_index()) | continent_median_population <- function(countries) { countries %>% group_by(continent) %>% summarize(population = median(population)) } | SELECT continent, AVG(population) as population FROM (SELECT *, row_number() OVER (PARTITION BY continent ORDER BY population) AS rank, count() OVER (PARTITION BY continent) as count FROM countries) WHERE (count % 2 = 1 AND rank = (count + 1) / 2) OR (count % 2 = 0 AND ABS(rank - 0.5 - count / 2) = 0.5) GROUP BY continent | .decl unique_id(Country:symbol, Id:number) unique_id(Country, $) :- countries(_, Country, _). .decl rank(Continent:symbol, R:number, Population:float) rank(Continent, R_less + R_eq, Population) :- countries(Continent, Country, Population), unique_id(Country, Id), R_less = count : { countries(Continent, C, P), unique_id(C, Id2), P < Population }, R_eq = count : { countries(Continent, C, P), unique_id(C, Id2), P = Population, Id2 < Id }. continent_median_population(Continent, Median) :- countries(Continent, _, _), Num_countries = count : countries(Continent, _, _), ((Num_countries % 2 = 1, rank(Continent, (Num_countries - 1) / 2, Median)); (Num_countries % 2 = 0, rank(Continent, Num_countries / 2 - 1, P1), rank(Continent, Num_countries / 2, P2), Median = (P1 + P2) / 2)). | continent_median_population: () xkey select med[population] by continent from countries |
| | Youngest person over 35 | def youngest_over_35(people): over_35 = [] for person in people: if person['age'] > 35: over_35.append(person) youngest = None for person in over_35: if youngest is None or person['age'] < youngest['age']: youngest = person return youngest['name'] if youngest is not None else None | def youngest_over_35(people): over_35 = [p for p in people if p['age'] > 35] youngest = min(over_35, default=None, key=lambda p: p['age']) return youngest['name'] if youngest is not None else None | def youngest_over_35(people): over_35 = people[people.age > 35] if len(over_35) == 0: return None else: youngest = over_35.loc[over_35.age.idxmin()] return youngest['name'] | youngest_over_35 <- function(people) { people %>% filter(age > 35) %>% slice(which.min(age)) %>% pull(name) } | SELECT name FROM people WHERE age = ( SELECT MIN(age) FROM people WHERE age > 35) LIMIT 1 | youngest_over_35(Name) :- people(Age, Name), Age = min Age : { people(Age, _), Age > 35 }. | old_enough: select from people where age > 35 youngest_over_35: (first select name from old_enough where age = min(age)) `name |
Joins | Directors of movies Tom Hanks starred in | def tom_hanks(actors,directors): r_directors = set() for a in actors: if 'Tom Hanks' == a['actor']: for d in directors: if a['movie'] == d['movie']: r_directors.add(d['director']) return r_directors | def tom_hanks(actors,directors): movies = [a['movie'] for a in actors if a['actor'] == 'Tom Hanks'] directors = [d['director'] for d in directors if d['movie'] in movies] return directors | def tom_hanks(actors,directors): movies = list(actors.loc[ actors['actor'] == 'Tom Hanks', 'movie']) return directors[directors['movie']. isin(movies)]['director'] | tom_hanks <- function(actors, directors) { actors %>% filter(actor == "Tom Hanks") %>% inner_join(directors, by = "movie") %>% pull(director) } | SELECT directors.director FROM directors INNER JOIN actors ON directors.movie = actors.movie WHERE actors.actor="Tom Hanks" | tom_hanks(Director):- actors("Tom Hanks",movie), directors(Director,movie). | tom_hanks: (select director from actors ij (`movie xkey directors) where actor ~\: "Tom Hanks") `director |
| | Row per family to row per child | def row_per_child(families): children = [] for family in families: for i in [1, 2, 3]: children.append({ 'family': family['family'], 'child': f'child{i}', 'dob': family[f'dob_child{i}'], 'height': family[f'height_child{i}'] }) return children | def row_per_child(families): return [ {'family': family['family'], 'child': f'child{i}', 'dob': family[f'dob_child{i}'], 'height': family[f'height_child{i}']} for family in families for i in [1, 2, 3] ] | def row_per_child(families): df = pd.wide_to_long( families, stubnames=['dob', 'height'], sep="_child", i='family', j='child').reset_index() df.child = df.child.map(lambda c: f'child{c}') return df | row_per_child <- function(families) { families %>% pivot_longer( !family, names_to = c(".value", "child"), names_sep = "_", ) } | SELECT family, ('child' \|\| child) AS child, (CASE child WHEN 1 THEN dob_child1 WHEN 2 THEN dob_child2 WHEN 3 THEN dob_child3 END) AS dob, (CASE child WHEN 1 THEN height_child1 WHEN 2 THEN height_child2 WHEN 3 THEN height_child3 END) AS height FROM families CROSS JOIN (SELECT 1 as child UNION VALUES (2), (3)) | row_per_child("child1", dob, family, height) :- families(dob, _, _, family, height, _, _). row_per_child("child2", dob, family, height) :- families(_, dob, _, family, _, height, _). row_per_child("child3", dob, family, height) :- families(_, _, dob, family, _, _, height). | child_rows: {[child] rows: ?[families; (); 0b; `family`dob`height ! (`family; `$("dob_child",child); `$("height_child", child))]; update child: (count families)#enlist ("child", child) from rows} row_per_child: (child_rows each ("1"; "2"; "3")) ,/; |
Strings | Convert strings with different formats to numbers | def strings_to_numbers(numbers): output = [] for row in numbers: if row["format"] == 'comma_sep': sep = "," else: sep = "_" output.append(int(row["value"].replace(sep, ""))) return output | def strings_to_numbers(numbers): return [ int(row["value"].replace( "," if row["format"] == "comma_sep" else "_", "")) for row in numbers ] | def strings_to_numbers(numbers): def convert(row): sep = "," if row.format == 'comma_sep' else "_" return int(row.value.replace(sep, "")) return numbers.apply(convert, axis=1).tolist() | strings_to_numbers <- function(numbers) { numbers %>% mutate(output = as.numeric(str_replace_all( value, ifelse(format == "comma_sep", ",", "_"), ""))) %>% pull(output) } | SELECT CAST( REPLACE( value, CASE format WHEN "comma_sep" THEN "," WHEN "under_sep" THEN "_" END, "") AS integer) FROM numbers | .decl clean(Format:symbol, Inp:symbol, I:number, Outp:symbol) clean(Format, Inp, 0, "") :- numbers(Format, Inp). clean(Format, Inp, I+1, Outp) :- clean(Format, Inp, I, Outp_rec), I <= strlen(Inp), Chr = substr(Inp, I, 1), ((Format = "comma_sep", Sep = ","); (Format = "under_sep", Sep = "_")), ((Chr = Sep, Outp = Outp_rec); (Chr != Sep, Outp = cat(Outp_rec, Chr))). strings_to_numbers(N) :- clean(Format, Inp, strlen(Inp), Outp), N = to_number(Outp). | convert: {[val; format] sep: (("comma_sep"; "under_sep") ! (","; "_")) format; "J" $ ssr[val; sep; ""]} strings_to_numbers: convert'[numbers[`value]; numbers[`format]] |
| | Documents with infrequent words | def documents_with_infrequent_words(documents): words = {} freq = defaultdict(int) for doc in documents: words[doc["id"]] = doc["text"].split(" ") for word in words[doc["id"]]: freq[word] += 1 infrequent_words = set() for word, count in freq.items(): if count == 1: infrequent_words.add(word) infrequent_docs = [] for doc in documents: for word in words[doc["id"]]: if word in infrequent_words: infrequent_docs.append(doc["id"]) break return infrequent_docs | def documents_with_infrequent_words(documents): words = [doc["text"].split(" ") for doc in documents] words_flat = [w for ws in words for w in ws] freq = { word: words_flat.count(word) for word in set(words_flat) } infrequent_words = set([ word for word, count in freq.items() if count == 1 ]) infrequent_docs = [ documents[i]["id"] for i, ws in enumerate(words) if len(set(ws) & infrequent_words) > 0 ] return infrequent_docs | def documents_with_infrequent_words(documents): words = documents.text.str.split(" ", expand=True) freq = words.stack().value_counts() infrequent_words = freq[freq == 1].index.values infrequent_docs = documents[ np.isin(words.values, infrequent_words)] return infrequent_docs.id.unique().tolist() | documents_with_infrequent_words <- function(documents) { split <- documents %>% mutate(word = str_split(text, " ")) %>% unnest() freq <- split %>% count(word) unique_words <- freq %>% filter(n == 1) split %>% filter(word %in% unique_words$word) %>% pull(id) %>% unique() } | -- NOTE: SQLite tokenize is case-insensitive by default, -- so this solution is NOT exactly like the others CREATE VIRTUAL TABLE doc_index USING fts4( text, id, content=documents, tokenize=simple); INSERT INTO doc_index(doc_index) VALUES('rebuild'); CREATE VIRTUAL TABLE words USING fts4aux(doc_index); SELECT DISTINCT id FROM documents CROSS JOIN (SELECT DISTINCT term FROM words WHERE occurrences = 1) unique_words WHERE (LOWER(text) LIKE '% ' \|\| term \|\| ' %') OR (LOWER(text) LIKE term \|\| ' %') OR (LOWER(text) LIKE '% ' \|\| term) OR (LOWER(text) LIKE term) | .decl substrs(Text:symbol, Idx:number, Len:number) substrs(Text, 0, 1) :- documents(_, Text), strlen(Text) > 0. substrs(Text, 0, Len+1) :- substrs(Text, 0, Len), Len + 1 <= strlen(Text). substrs(Text, Idx+1, Len) :- substrs(Text, Idx, Len), Idx + Len + 1 <= strlen(Text). .decl token(Docid:number, Text:symbol, Idx:number, Word:symbol) token(Docid, Text, Idx, Word) :- documents(Docid, Text), substrs(Text, Idx, Len), Prev = Idx - 1, Next = Idx + Len, (Prev < 0; " " = substr(Text, Prev, 1)), (Next = strlen(Text); " " = substr(Text, Next, 1)), Word = substr(Text, Idx, Len), !contains(" ", Word). documents_with_infrequent_words(Id) :- documents(Id, _), token(Id, _, _, Word), 1 = count : token(_, _, _, Word). | words: (" " vs) each documents[`text]; freq: count each group raze words; uniq: where[freq=1]; documents_with_infrequent_words: (select id from documents where '[any; in\: [;uniq]] each words) `id |
| | Filter and clean tweets | def process_tweets(data): result = [] for value in data: if (value["language"] == "en" and value["is_retweet"] == "false"): result.append({ "body": value["body"].lower(), "ts": value["ts"] }) return result | def process_tweets(data): return [ {"body": value["body"].lower(), "ts": value["ts"]} for value in data if value["language"] == "en" and value["is_retweet"] == "false" ] | def process_tweets(data): result = data[ (data.language == 'en') & (data.is_retweet == 'false')] result.body = result.body.apply(lambda s: s.lower()) return result[["body", "ts"]] | process_tweets <- function(data) { data %>% filter(language == "en" & is_retweet == "false") %>% mutate(body = tolower(body)) %>% select(ts, body) } | SELECT LOWER(body) as body, ts FROM data WHERE language = "en" and is_retweet = "false" | | process_tweets: select lower[body], ts from data where (is_retweet ~\: "false") and (language ~\: "en") |
First-order logic | People that purchased all possible items | def purchased_all_food(food, orders): n_food = len(food) unique_orders = defaultdict(set) for order in orders: unique_orders[order["buyer"]].add(order["food"]) buyers = [] for buyer, orders in unique_orders.items(): if len(orders) == n_food: buyers.append(buyer) return buyers | def purchased_all_food(food, orders): n_food = len(food) buyers = set([order["buyer"] for order in orders]) return [ buyer for buyer in buyers if len(set([order["food"] for order in orders if order["buyer"] == buyer])) == n_food ] | def purchased_all_food(food, orders): n_food = len(food) n_unique_orders = (orders .groupby('buyer') .food.unique() .map(len)) return n_unique_orders[n_unique_orders == n_food].index.values | purchased_all_food <- function(food, orders) { n_food <- count(food) orders %>% group_by(buyer) %>% distinct(food) %>% count() %>% filter(n == n_food) %>% pull(buyer) } | SELECT DISTINCT buyer FROM orders o1 WHERE (SELECT COUNT(DISTINCT food) FROM orders o2 WHERE o1.buyer = o2.buyer) = (SELECT COUNT(*) FROM food) | .decl has_purchased(Buyer:symbol, Food:number) has_purchased(Buyer, Food) :- orders(Buyer, Food, _). purchased_all_food(Buyer) :- orders(Buyer, _, _), N_food = count : food(_, _), N_unique_orders = count : has_purchased(Buyer, _), N_food = N_unique_orders. | buyers: `buyer xgroup orders; total: count food; purchased_all_food: (where {(count distinct x[`food]) = total} each buyers) `buyer |
| | People who like a unique set of beer | def unique_beer_drinkers(likes): likes_per_person = defaultdict(set) for row in likes: likes_per_person[row['name']].add(row['beer']) unique = [] for p1, p1_likes in likes_per_person.items(): is_unique = True for p2, p2_likes in likes_per_person.items(): if p1 == p2: continue if p1_likes == p2_likes: is_unique = False break if is_unique: unique.append(p1) return unique | def unique_beer_drinkers(likes): people = set([row['name'] for row in likes]) likes_per_person = { name: set([row['beer'] for row in likes if row['name'] == name]) for name in people } return [ name for name, beers in likes_per_person.items() if not any([ other_name != name and beers == other_beers for other_name, other_beers in likes_per_person.items() ]) ] | def unique_beer_drinkers(likes): likes_per_person = (likes .groupby('name') .beer.unique().map(set) .reset_index()) def check_not_exists(row): other_people = likes_per_person[ likes_per_person.name != row['name']] return not (other_people.beer == row['beer']).any() unique_drinkers = likes_per_person[ likes_per_person.apply(check_not_exists, axis=1)] return unique_drinkers.name.tolist() | unique_beer_drinkers <- function(likes) { likes %>% group_by(name) %>% summarize(beer_set = list(sort(unique(beer)))) %>% add_count(beer_set, name = 'num_people_likes') %>% filter(num_people_likes == 1) %>% pull(name) } | SELECT DISTINCT L1.name FROM likes L1 WHERE NOT EXISTS( SELECT * FROM likes L2 WHERE L1.name != L2.name AND NOT EXISTS( SELECT * FROM likes L3 WHERE L3.name = L2.name AND NOT EXISTS( SELECT * FROM likes L4 WHERE L4.name = L1.name AND L4.beer = L3.beer)) AND NOT EXISTS( SELECT * FROM likes L5 WHERE L5.name = L1.name AND NOT EXISTS( SELECT * FROM likes L6 WHERE L6.name = L2.name AND L6.beer= L5.beer))) | .decl differ(a:symbol, b:symbol) differ(A, B) :- likes(Beer, A), likes(_, B), !likes(Beer, B). differ(A, B) :- likes(_, A), likes(Beer, B), !likes(Beer, A). .decl exists_same(a:symbol) exists_same(Name) :- likes(_, Other), likes(_, Name), Name != Other, !differ(Name, Other). unique_beer_drinkers(Name) :- likes(_, Name), !exists_same(Name). | likes_per_person: `name xgroup likes; counts: count each group likes_per_person; unique_beer_drinkers: (select name from likes_per_person where beer in\: where[counts=1]) `name |
Time Series | Rolling average | def rolling_average(data): data.sort(key=lambda v: v["time"]) result = [] for i, value in enumerate(data): end = value["time"] total, count = 0.0, 0 for j in range(i, -1, -1): if data[j]["time"] <= end - 7: break total += data[j]["x"]; count += 1 result.append( {"end_time": end, "average": total / count} ) return result | def rolling_average(data): return [ {"end_time": x["time"], "average": sum(vs) / len(vs)} for x in data for vs in [ [y["x"] for y in data if y["time"] <= x["time"] and y["time"] > x["time"] - 7] ] ] | def rolling_average(data): d = data.copy() data.time = pd.to_datetime(data.time * 10**9) data = (data.sort_values('time').set_index('time') .rolling(window='7s').mean()) return pd.DataFrame.from_dict( {'end_time': d.sort_values('time').reset_index().time, 'average': data.reset_index().x} ) | library(slider) rolling_average <- function(data) { data <- arrange(data, time) avgs <- unlist(slide_index(data$x, data$time, ~ mean(.x), .before = 6)) data %>% mutate(end_time = time, average = avgs) %>% select(end_time, average) } | SELECT end.time as end_time, AVG(other.x) as average FROM data as end JOIN data as other ON other.time <= end.time and other.time > end.time - 7 GROUP BY end.time | .decl window(end_time: number, time: number) window(end_time, t) :- data(end_time, _), data(t, _), t <= end_time, t > end_time - 7. .decl bucket(end_time: number, total: float, n: float) bucket(end_time, total, n) :- data(end_time, _), total = sum v : {data(t, v), window(end_time, t)}, n = sum z : {data(t, _), window(end_time, t), z = 1.0}. rolling_average(end_time, v) :- bucket(end_time, total, n), v = total / n. | get_avg: {[t] (select avg(x) from data where time within (t - 6; t)) `x}; rolling_average: select end_time: time, average: get_avg'[time] from data |
| | Windowed average | def average_window(data): def window(value): return math.floor(value["time"] / 7) if len(data) < 1: return [] data.sort(key=lambda v: v["time"]) result = [] current_window = window(data[0]) total, count = data[0]["x"], 1 for value in data[1:]: time_window = window(value) if time_window != current_window: result.append(total / count) current_window, total, count = time_window, 0, 0 total += value["x"] count += 1 result.append(total / count) return result | def average_window(data): def window(value): return math.floor(value["time"] / 7) grouped_values = [ [point["x"] for point in data if window(point) == w] for w in set(map(window, data)) ] return [ sum(values) / len(values) for values in grouped_values ] | def average_window(data): def window(t): return math.floor(t / 7) result = (data.sort_values("time") .set_index("time").groupby(window).mean()) return result['x'].tolist() | average_window <- function(data) { data %>% group_by(floor(time / 7)) %>% summarize(avg = mean(x)) %>% pull(avg) } | SELECT AVG(x) as x FROM data GROUP BY cast(time / 7 as int) ORDER BY time | .decl window(w: number) window(t/7) :- data(t, _). .decl windowed(w: number, x: float) windowed(t/7, x) :- data(t, x). .decl windowed_total(w: number, x: float) windowed_total(w, total / n) :- window(w), total = sum x : { windowed(w, x) }, n = sum z : { windowed(w, x), z=1.0 }. average_window(v) :- windowed_total(_, v). | average_window: (value select[<time] avg x by 7 xbar time from data) `x |
Graphs | Reflexive-transitive closure | def reachable(graph, query): adjacency_list = defaultdict(list) for edge in graph: adjacency_list[edge["source"]].append(edge["target"]) source = query[0]["source"] visited = set() to_visit = set([source]) while len(to_visit) > 0: current = to_visit.pop() if current in visited: continue for neighbor in adjacency_list[current]: to_visit.add(neighbor) visited.add(current) return list(visited) | def reachable(graph, query): def step(visited): frontier = set([ edge["target"] for vertex in visited for edge in graph if vertex == edge["source"] ]) return frontier.union(visited) def fix(f, x): next = f(x) return x if next == x else fix(f, next) source = query[0]["source"] return list(fix(step, set([source]))) | | | WITH RECURSIVE closure(source, target) AS ( SELECT source, source FROM graph UNION SELECT source, target FROM graph UNION SELECT edge.source, path.target FROM closure as path JOIN graph as edge ON edge.target = path.source ) SELECT S.target FROM closure as S JOIN query ON S.source = query.source | .decl path(x: symbol, y: symbol) path(x, y) :- graph(x, y). path(x, y) :- graph(x, z), path(z, y). reachable(source) :- query(source). reachable(x) :- query(source), path(source, x). | graph: (first') each graph; nodes: asc distinct graph[`source], graph[`target]; adj_list: `source xgroup graph; adj_matrix: {in[;x] each nodes} each nodes ,' (adj_list each nodes) `target; iterated_matrix: last ({x \| any each (x *\: x)} \) adj_matrix; query_idx: nodes ? (first first query[`source]); reachable: nodes where iterated_matrix[query_idx] |
| | Strongly-connected components | def scc(graph, query): reachable = set() vertices = set() for edge in graph: reachable.add((edge["source"], edge["target"])) vertices.add(edge["source"]) vertices.add(edge["target"]) changed = True while changed: changed = False for edge in graph: for vertex in vertices: if ((edge["source"], vertex) not in reachable and (edge["target"], vertex) in reachable): changed = True reachable.add((edge["source"], vertex)) source = query[0]["source"] result = [] for vertex in vertices: if ((source, vertex) in reachable and (vertex, source) in reachable): result.append(vertex) return result | def scc(graph, query): def step(relation): return set([ (source, edge["target"]) for (source, target) in relation for edge in graph if target == edge["source"] ]).union(relation) def fix(f, x): next = f(x) return x if next == x else fix(f, next) source = query[0]["source"] init = set([(edge["source"], edge["target"]) for edge in graph]) reachable = fix(step, init) return list(set([v for (_, v) in reachable if (v, source) in reachable and (source, v) in reachable])) | | | WITH RECURSIVE closure(source, target) AS ( SELECT DISTINCT source, target FROM graph UNION SELECT edge.source, path.target FROM closure as path JOIN graph as edge ON edge.target = path.source ), component(v1, v2) AS ( SELECT path.source, path.target FROM closure as path JOIN closure as path_ ON path.source = path_.target AND path.target = path_.source ) SELECT S.v2 FROM component as S JOIN query ON S.v1 = query.source | .decl scc1(x: symbol, y: symbol) scc1(x, x) :- vertex(x). scc1(x, y) :- path(x, y), path(y, x). .decl path(x: symbol, y: symbol) path(x, y) :- graph(x, y). path(x,y) :- graph(x, z), path(z, y). .decl vertex(x: symbol) vertex(x) :- graph(x, _). vertex(x) :- graph(_, x). scc(x) :- query(src), scc1(src, x). | |
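
As a concrete point of reference, here is the Python imperative solution for "Youngest person over 35" from the table, restored to multi-line form and paired with a small test fixture. The list-of-dicts `people` schema (records with `name` and `age` fields) is inferred from the solutions above; the sample data and harness are purely illustrative and not part of the benchmark.

```python
# "Youngest person over 35" -- Python imperative solution from the table,
# reformatted onto multiple lines. The `people` records below are a
# hypothetical fixture; the benchmark's actual test data is not shown here.

def youngest_over_35(people):
    # Keep only people strictly older than 35.
    over_35 = []
    for person in people:
        if person['age'] > 35:
            over_35.append(person)
    # Linear scan for the minimum age among the remaining people.
    youngest = None
    for person in over_35:
        if youngest is None or person['age'] < youngest['age']:
            youngest = person
    return youngest['name'] if youngest is not None else None

if __name__ == "__main__":
    people = [
        {"name": "Ada", "age": 36},
        {"name": "Grace", "age": 45},
        {"name": "Alan", "age": 30},
    ]
    # Ada is the youngest person whose age exceeds 35.
    assert youngest_over_35(people) == "Ada"
```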