In this markdown, we assume that the user has already made embeddings of a list of questions, and clustered them. The data frame loaded below must contain the questions and the cluster info.
We will take the data frame and organize the data by cluster, collecting the set of questions contained in each one. From there, each question set will be passed to an LLM to generate a label, which will be in the form of a “metaquestion.”
For instance, if there is a cluster of questions around IBD and the microbiome, then the label might be “What is the relationship between the microbiome and IBD patients?”
We first load the data frame of interest.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/tylerburns/workspace/07_experiments/tech_projects/active/infinite_backroom
# Work from the clustering output folder (resolved from the project root via
# here) so that the CSV can be read by its bare file name.
setwd(
  here::here("output", "monologue", "question_map_with_clustering", "kmeans_20")
)
dat <- readr::read_csv(file = "question_map_with_clustering_df.csv")
## New names:
## Rows: 9746 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): question, cluster_label, hover dbl (4): ...1, x, y, cluster
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Preview the loaded table; evaluating the tibble at top level prints its
# first rows along with the column types.
dat
## # A tibble: 9,746 × 7
## ...1 x y question cluster cluster_label hover
## <dbl> <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 0 3.33 7.64 The prompt is: Please generat… 11 C11 The …
## 2 1 -3.78 4.65 How does the diversity of the… 10 C10 How …
## 3 2 7.70 6.74 How do early-life microbial c… 14 C14 How …
## 4 3 10.6 -4.98 How does the gut microbiome c… 18 C18 How …
## 5 4 -1.67 8.94 How do changes in gut microbi… 0 C0 How …
## 6 5 -4.70 2.01 How does the gut microbiome c… 9 C9 How …
## 7 6 -0.786 -5.54 How does the gut microbiome c… 8 C8 How …
## 8 7 1.35 6.35 How does the gut microbiome c… 11 C11 How …
## 9 8 -2.18 3.36 How does the gut microbiome c… 10 C10 How …
## 10 9 -1.61 8.84 How does the gut microbiome c… 0 C0 How …
## # ℹ 9,736 more rows
Let’s pull out the questions, organized by cluster ID.
# Cluster labels, in order of first appearance in the data.
clust_names <- unique(dat$cluster_label)
# For every label, collect the questions of the rows carrying that label.
# which() drops NA comparisons, so rows without a label are never selected.
questions <- setNames(
  lapply(clust_names, function(label) {
    dat$question[which(dat$cluster_label == label)]
  }),
  clust_names
)
# Number of questions per cluster, printed in full (up to 20 rows).
q_len <- unlist(lapply(questions, length))
tmp <- tibble(cluster = clust_names, num_questions = q_len)
print(tmp, n = 20)
## # A tibble: 20 × 2
## cluster num_questions
## <chr> <int>
## 1 C11 910
## 2 C10 577
## 3 C14 793
## 4 C18 521
## 5 C0 835
## 6 C9 734
## 7 C8 562
## 8 C4 452
## 9 C3 138
## 10 C13 307
## 11 C7 384
## 12 C15 530
## 13 C5 650
## 14 C6 423
## 15 C16 415
## 16 C12 288
## 17 C17 324
## 18 C2 601
## 19 C19 225
## 20 C1 77
Now we feed it into the chatbot.
# Load necessary libraries
library(httr)
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
# Read the OpenRouter key from the OPENROUTER_API_KEY environment variable so
# the secret never has to be written into this file. When the variable is not
# set, fall back to the placeholder, which must then be replaced by hand.
user_api_key <- Sys.getenv("OPENROUTER_API_KEY", unset = "YOUR_OPENROUTER_API_KEY")
# Define the function Chatbot, which sends a single user message to the
# OpenRouter chat-completions endpoint and returns the model's reply.
#
# Arguments:
#   model_choice   Short alias of the model to query. Only "geminifl"
#                  (mapped to "google/gemini-2.5-flash-lite") is supported.
#   user_question  A single non-empty string, sent as the user message.
#   api_key        OpenRouter API key; defaults to the global `user_api_key`.
#
# Returns: the `content` field of the first choice's message.
# Errors: on an unknown model alias, an invalid `user_question` or `api_key`,
#   a non-200 HTTP status, an error object in the response body, or a response
#   that carries no reply.
Chatbot <- function(model_choice = "geminifl", user_question, api_key = user_api_key) {
  # Validate everything up front so a bad call fails before any network traffic.
  if (!is.character(model_choice) || length(model_choice) != 1L || is.na(model_choice)) {
    stop("Invalid model choice. Please choose geminifl.", call. = FALSE)
  }
  # Map the model choice to the corresponding model identifier
  model <- switch(
    model_choice,
    geminifl = "google/gemini-2.5-flash-lite",
    stop("Invalid model choice. Please choose geminifl.", call. = FALSE)
  )
  if (missing(user_question) || !is.character(user_question) ||
      length(user_question) != 1L || is.na(user_question) ||
      !nzchar(user_question)) {
    stop("`user_question` must be a single non-empty string.", call. = FALSE)
  }
  if (!is.character(api_key) || length(api_key) != 1L || is.na(api_key) ||
      !nzchar(api_key)) {
    stop("`api_key` must be a single non-empty string.", call. = FALSE)
  }

  # Define the API endpoint
  url <- "https://openrouter.ai/api/v1/chat/completions"

  # Create the JSON payload for the POST request
  payload <- list(
    model = model,
    messages = list(
      list(
        role = "user",
        content = user_question
      )
    )
  )
  # auto_unbox keeps length-one values as JSON scalars rather than arrays
  payload_json <- toJSON(payload, auto_unbox = TRUE)

  # Send the POST request to the API
  response <- POST(
    url,
    add_headers(
      Authorization = paste("Bearer", api_key),
      "Content-Type" = "application/json"
    ),
    body = payload_json
  )

  # Check for errors in the API response
  if (status_code(response) != 200) {
    stop(
      "Error: Received status code ", status_code(response), "\n",
      content(response, "text", encoding = "UTF-8"),
      call. = FALSE
    )
  }

  # Parse the JSON response
  result <- content(response, "parsed", encoding = "UTF-8")

  # A 200 response can still carry an error object instead of choices.
  api_error <- result[["error"]]
  if (!is.null(api_error)) {
    detail <- if (is.list(api_error)) api_error[["message"]] else api_error
    stop("Error: API returned an error: ", paste(detail, collapse = " "), call. = FALSE)
  }
  choices <- result[["choices"]]
  if (length(choices) == 0L) {
    stop("Error: API response contained no choices.", call. = FALSE)
  }
  reply <- choices[[1]]$message$content
  if (is.null(reply)) {
    stop("Error: API response contained no message content.", call. = FALSE)
  }

  # Return the chatbot's reply
  reply
}
# Smoke test: send one short question with the default model and API key,
# then print the reply after an "Answer:" prefix.
answer <- Chatbot(
  user_question = "What is better, ChatGPT or Claude? Please keep your answer no more than a few sentences."
)
cat("Answer:", answer)
## Answer: Both ChatGPT and Claude are highly capable AI language models, but their strengths can differ. ChatGPT, especially its newer GPT-4 version, is generally known for its creativity, conversational abilities, and vast knowledge base. Claude, on the other hand, often excels in tasks requiring nuanced understanding, longer context windows, and a focus on safety and ethical AI principles. The "better" choice depends heavily on the specific task you're trying to accomplish.
Let’s test this on a single cluster before we run `lapply` over all of them.
# Join every question of cluster C11 into one "; "-separated string.
test <- paste(questions[["C11"]], collapse = "; ")
And now we query.
# Ask for one meta-question covering the test cluster; sprintf() drops the
# joined questions into the %s slot, and the reply auto-prints.
Chatbot(
  user_question = sprintf(
    "Please give me an meta-question that encapsulates the main essence of all of the following questions, delimited by a semicolon. Please just say the question, with no extra fluff. Here are the questions: %s",
    test
  )
)
## [1] "How does the gut microbiome respond to and interact with dietary factors to influence host health and aging across diverse populations and conditions?"
# Prompt template; %s is replaced by one cluster's "; "-separated questions.
query <- "Please give me an meta-question that encapsulates the main essence of all of the following questions, delimited by a semicolon. Please just say the question, with no extra fluff. Here are the questions: %s"

# Ask the LLM for one meta-question per cluster. Each reply is checked to be
# exactly one string: a missing or multi-element reply would otherwise be
# dropped or flattened later and shift labels onto the wrong clusters.
# Map() names the resulting list after the cluster labels.
meta_questions <- Map(
  function(cluster_id, cluster_questions) {
    prompt <- sprintf(query, paste(cluster_questions, collapse = "; "))
    reply <- Chatbot(user_question = prompt)
    if (!is.character(reply) || length(reply) != 1L || is.na(reply)) {
      stop(
        "No usable meta-question returned for cluster ", cluster_id,
        call. = FALSE
      )
    }
    reply
  },
  names(questions),
  questions
)

# Take the labels from the list itself so label and meta-question always come
# from the same element; vapply() enforces one string per cluster.
meta_questions_tib <- tibble(
  cluster_label = names(meta_questions),
  meta_question = vapply(meta_questions, identity, character(1))
)
Now we join the metaquestions data frame with our original data frame on the cluster label, so that every question row carries the metaquestion of its cluster. This will allow us to plot them.
# Attach each cluster's meta-question to every row of that cluster. Many rows
# of `dat` map to one row of `meta_questions_tib`; declaring that relationship
# makes the join fail loudly if a cluster label is ever duplicated on the
# right-hand side, instead of silently multiplying rows.
final <- dplyr::left_join(
  dat,
  meta_questions_tib,
  by = "cluster_label",
  relationship = "many-to-one"
)
final
## # A tibble: 9,746 × 8
## ...1 x y question cluster cluster_label hover meta_question
## <dbl> <dbl> <dbl> <chr> <dbl> <chr> <chr> <chr>
## 1 0 3.33 7.64 The prompt is: … 11 C11 The … How does the…
## 2 1 -3.78 4.65 How does the di… 10 C10 How … What factors…
## 3 2 7.70 6.74 How do early-li… 14 C14 How … How does the…
## 4 3 10.6 -4.98 How does the gu… 18 C18 How … How does var…
## 5 4 -1.67 8.94 How do changes … 0 C0 How … How does the…
## 6 5 -4.70 2.01 How does the gu… 9 C9 How … How does the…
## 7 6 -0.786 -5.54 How does the gu… 8 C8 How … How does the…
## 8 7 1.35 6.35 How does the gu… 11 C11 How … How does the…
## 9 8 -2.18 3.36 How does the gu… 10 C10 How … What factors…
## 10 9 -1.61 8.84 How does the gu… 0 C0 How … How does the…
## # ℹ 9,736 more rows
And now we output what we have:
# Output: the labelled question table as a CSV file, and the per-cluster
# meta-questions rendered as an HTML table. Both paths are relative to the
# current working directory.
readr::write_csv(x = final, file = "question_map_with_labeled_clusters.csv")
mq_html <- knitr::kable(x = meta_questions_tib, format = "html")
writeLines(text = mq_html, con = "meta_questions.html")