# Start from an empty questions file: opening in "w" mode creates the file if
# needed and truncates it, so the generation step that appends to it does not
# pile new questions on top of a previous run's output.
output_file = "raw_questions.txt"
open(output_file, "w").close()

%%bash
# Ask the chatbot the same prompt num_iter times and append every response to
# output_file. Each call is independent, so the file ends up holding num_iter
# separately generated questions.
output_file="raw_questions.txt"
num_iter=100
prompt="Please generate a random research question about the microbiome. Output only the question. No extra stuff."
echo "The prompt is: $prompt"
echo

for (( i = 0; i < num_iter; i++ ))
do
    chatbot geminifl "$prompt" >> "$output_file"
    # Explicit newline after every response so consecutive responses can never
    # run together on one line. The path is quoted here as well, so a file name
    # containing spaces or glob characters is not word-split by the shell.
    printf '\n' >> "$output_file"
done

The prompt is: Please generate a random research question about the microbiome. Output only the question. No extra stuff.

import os, warnings

# Suppress every warning raised through the `warnings` module in this process.
warnings.filterwarnings('ignore')
# Mirror the same policy in the environment.
# NOTE(review): presumably so that Python child processes inherit it - confirm.
os.environ['PYTHONWARNINGS'] = 'ignore'
# NOTE(review): presumably read by the tokenizer backend used by
# sentence-transformers to turn off its internal parallelism; it is set before
# the third-party imports below - confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import sys
from pathlib import Path
import textwrap
from typing import List

import numpy as np
import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer
import umap
from sklearn.cluster import KMeans

# Name handed to SentenceTransformer(...) in embed().
MODEL_NAME = "all-mpnet-base-v2"
# Maximum line width (in characters) used by wrap() for hover text.
WRAP_WIDTH = 80
# Seed passed as random_state to both UMAP (umap_2d) and KMeans (cluster_coords).
UMAP_RANDOM_STATE = 42

# Number of clusters (k) requested from KMeans in cluster_coords(); change it
# to get a different number of clusters.
N_CLUSTERS = 10          # k for k means; change for desired cluster number

def split_by_blank(t: str) -> List[str]:
    """Split *t* into paragraphs separated by a blank line.

    The text is stripped, split on every ``"\\n\\n"``, and each piece is
    stripped again; pieces that are empty after stripping are dropped.

    Args:
        t: The raw text to split.

    Returns:
        The non-empty, whitespace-stripped paragraphs in their original order.
    """
    # Strip each piece once, then filter, instead of stripping twice per piece.
    stripped_parts = (part.strip() for part in t.strip().split("\n\n"))
    return [part for part in stripped_parts if part]

def embed(texts: List[str]):
    """Encode *texts* with the sentence-transformer named by MODEL_NAME.

    A new SentenceTransformer is instantiated on every call. The encoder is
    asked for NumPy output with normalized embeddings.

    Args:
        texts: The strings to embed.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for these arguments
        (presumably one embedding row per input text - confirm).
    """
    model = SentenceTransformer(MODEL_NAME)
    vectors = model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return vectors

def umap_2d(embeddings):
    """Reduce *embeddings* to two components with UMAP.

    Uses the cosine metric and the fixed seed UMAP_RANDOM_STATE.

    Args:
        embeddings: The data handed to ``UMAP.fit_transform``.

    Returns:
        The result of ``fit_transform`` with ``n_components=2``.
    """
    return umap.UMAP(
        n_components=2,
        metric="cosine",
        random_state=UMAP_RANDOM_STATE,
    ).fit_transform(embeddings)

def cluster_coords(coords):
    """Assign each row of *coords* to one of N_CLUSTERS k-means clusters.

    The estimator is seeded with UMAP_RANDOM_STATE and uses ``n_init="auto"``.

    Args:
        coords: The points to cluster (the caller passes the UMAP output).

    Returns:
        The labels returned by ``KMeans.fit_predict``.
    """
    kmeans = KMeans(
        n_clusters=N_CLUSTERS,
        random_state=UMAP_RANDOM_STATE,
        n_init="auto",
    )
    return kmeans.fit_predict(coords)

def wrap(text: str) -> str:
    """Wrap *text* to WRAP_WIDTH columns, joining the lines with ``<br>``.

    Args:
        text: The string to wrap.

    Returns:
        The wrapped text as a single string with ``<br>`` between lines.
    """
    # textwrap.fill() is "\n".join(textwrap.wrap(...)); joining with "<br>"
    # directly gives the same string as filling and then replacing "\n".
    wrapped_lines = textwrap.wrap(text, width=WRAP_WIDTH)
    return "<br>".join(wrapped_lines)

def build_df(questions, coords, labels):
    """Assemble the plotting DataFrame for the clustered questions.

    Args:
        questions: The question texts, stored in the ``question`` column.
        coords: Indexed as ``coords[:, 0]`` / ``coords[:, 1]`` for ``x`` / ``y``.
        labels: The cluster labels, stored in the ``cluster`` column.

    Returns:
        A DataFrame with columns ``x``, ``y``, ``question``, ``cluster``,
        ``question_wrapped`` (``wrap`` applied to each question) and
        ``cluster_label`` (the cluster cast to ``str``).
    """
    columns = {
        "x": coords[:, 0],
        "y": coords[:, 1],
        "question": questions,
        "cluster": labels,
    }
    frame = pd.DataFrame(columns).reset_index(drop=True)

    # Derived columns: wrapped hover text and a string version of the cluster.
    frame["question_wrapped"] = frame["question"].apply(wrap)
    frame["cluster_label"] = frame["cluster"].astype(str)
    return frame

def plot(df):
    """Write an interactive scatter plot of *df* to an HTML file.

    Points are placed at (``x``, ``y``) and coloured by ``cluster_label``; the
    hover box shows only ``question_wrapped``. The figure is written to
    ``umap_questions_with_clustering.html`` with ``auto_open=True``.

    Args:
        df: DataFrame with the columns ``x``, ``y``, ``cluster_label`` and
            ``question_wrapped``.
    """
    scatter_options = {
        "x": "x",
        "y": "y",
        "color": "cluster_label",
        "custom_data": ["question_wrapped"],
        "template": "plotly_white",
        "title": "UMAP of research questions (clustered on UMAP coords)",
    }
    fig = px.scatter(df, **scatter_options)

    # The empty <extra></extra> tag is part of the template passed to plotly.
    marker_style = {"size": 9, "opacity": 0.8}
    fig.update_traces(
        marker=marker_style,
        hovertemplate="%{customdata[0]}<extra></extra>",
    )
    fig.write_html("umap_questions_with_clustering.html", auto_open=True)

# Load the generated questions: one per line, skipping lines that are blank
# or contain only whitespace.
with open("raw_questions.txt", "r") as file:
    questions = [line for line in file.read().splitlines() if line.strip()]

if not questions:
    sys.exit("No questions found.")

print(f"{len(questions)} questions → embed → UMAP → K‑means (on coords) → plot …")

# Pipeline: embed the text, project to 2-D, cluster on the 2-D coordinates,
# then build the plotting frame and render it.
emb = embed(questions)
coords = umap_2d(emb)
labels = cluster_coords(coords)

df = build_df(questions, coords, labels)
plot(df)

# Output: persist the embeddings (columns emb_1 .. emb_N) and the plotting
# frame as CSV files.
emb = pd.DataFrame(
    emb,
    columns=[f"emb_{i + 1}" for i in range(emb.shape[1])],
)
emb.to_csv("question_map_embeddings.csv")
df.to_csv("question_map_with_clustering_df.csv")

100 questions → embed → UMAP → K‑means (on coords) → plot …

# Collect the question texts of every cluster: one inner list per cluster,
# ordered by ascending cluster label (groupby sorts its keys by default).
#
# The groupby is evaluated once instead of being rebuilt for every single
# question, and the strings are taken straight from the column rather than
# round-tripped through np.array2string(...).strip("[]"), which returned the
# quoted/escaped numpy repr of the text and also stripped genuine leading or
# trailing square brackets from a question.
questions_list = [
    cluster_frame["question"].tolist()
    for _, cluster_frame in df.groupby("cluster")
]

import subprocess

output_file = "meta_questions.txt"

# Opening in "w" mode empties any previous run's output; the responses are then
# written through the same handle, one line per cluster, in the order of
# questions_list.
with open(output_file, "w") as file:
    # The loop variable is deliberately not called `questions`, so the
    # module-level list of all questions is not overwritten.
    for cluster_questions in questions_list:
        questions_cleaned = [question.strip('\'"') for question in cluster_questions]
        questions_str = " ".join(questions_cleaned)
        query = f"Please give me a metaquestion that encapsulates the main investigatory essense of all of the following questions. Just the question please, no extra fluff. Here are the questions: {questions_str}"

        # Pass the query as a single argv entry instead of splicing it into a
        # shell command line: the questions are free text and may contain
        # quotes, `$`, backticks or braces that a shell (or IPython's `$var`
        # expansion) would interpret. check=True surfaces a failed call here
        # rather than leaving a missing line that would shift every later
        # cluster's metaquestion.
        completed = subprocess.run(
            ["chatbot", "geminifl", query],
            stdout=subprocess.PIPE,
            text=True,
            check=True,
        )

        # Collapse the response onto exactly one line so that the i-th line of
        # the file always belongs to the i-th cluster, whether or not the tool
        # emits a trailing newline or a multi-line answer.
        file.write(" ".join(completed.stdout.split()) + "\n")

# Read the metaquestions back: one per non-blank line, with surrounding
# whitespace and quote characters removed.
with open("meta_questions.txt", "r") as file:
    meta_questions_raw = file.read().splitlines()
meta_questions_list = [
    line.strip().strip('\'"') for line in meta_questions_raw if line.strip()
]

# The file holds one metaquestion per cluster in ascending cluster-label order,
# so pair the lines with the labels actually present in df rather than with
# their line positions.
cluster_ids = sorted(int(label) for label in df["cluster"].unique())
if len(meta_questions_list) != len(cluster_ids):
    # A mismatch would either mislabel clusters or leave NaN metaquestions
    # that fail later when wrapped for plotting; stop with a clear message.
    raise ValueError(
        f"Expected {len(cluster_ids)} metaquestions (one per cluster) in "
        f"meta_questions.txt but found {len(meta_questions_list)}."
    )

meta_questions_dict = dict(zip(cluster_ids, meta_questions_list))

df["meta_question"] = df["cluster"].map(meta_questions_dict)

# Wrap the metaquestion text the same way as the individual questions so the
# legend entries and hover boxes stay readable.
df["meta_question_wrapped"] = df["meta_question"].apply(wrap)

# Colour by (wrapped) metaquestion; the hover box shows only the wrapped
# question and wrapped metaquestion, every other column is switched off.
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="meta_question_wrapped",
    hover_data={
        "x": False,
        "y": False,
        "question_wrapped": True,
        "meta_question_wrapped": True,
        "cluster": False,
        "cluster_label": False,
    },
    template="plotly_white",
    title="UMAP of research questions in metaquestion clusters",
)

fig.update_traces(hoverlabel={"font_size": 20})

fig.write_html("umap_questions_with_mq.html", auto_open=True)

Using LLMs to Chart the Topography of Microbiome Research

Introduction

Generating the questions

Making the embeddings

Labelling the clusters as metaquestions

Making the final interactive UMAP