Home

In this notebook, we assume that the user has already computed embeddings for a list of questions and clustered them. The data frame that is loaded must contain the questions and their cluster assignments.

We will take the data frame and organize the data by cluster, collecting the set of questions each cluster contains. From there, each question set will be passed to an LLM to generate a label, which will be in the form of a “metaquestion.”

For instance, if there is a cluster of questions around IBD and the microbiome, then the label might be “What is the relationship between the microbiome and IBD patients?”

We first load the data frame of interest.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/tylerburns/workspace/07_experiments/tech_projects/active/infinite_backroom
# Switch the working directory to the kmeans_20 output folder, resolved from
# the project root that here() detects. The relative file names used for
# reading and writing in this notebook are resolved inside this folder.
input_dir <- here::here(
  "output", "monologue", "question_map_with_clustering", "kmeans_20"
)
setwd(input_dir)

# Read the clustered question table (one row per question).
dat <- readr::read_csv(file = "question_map_with_clustering_df.csv")
## New names:
## Rows: 9746 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): question, cluster_label, hover dbl (4): ...1, x, y, cluster
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
dat
## # A tibble: 9,746 × 7
##     ...1      x     y question                       cluster cluster_label hover
##    <dbl>  <dbl> <dbl> <chr>                            <dbl> <chr>         <chr>
##  1     0  3.33   7.64 The prompt is: Please generat…      11 C11           The …
##  2     1 -3.78   4.65 How does the diversity of the…      10 C10           How …
##  3     2  7.70   6.74 How do early-life microbial c…      14 C14           How …
##  4     3 10.6   -4.98 How does the gut microbiome c…      18 C18           How …
##  5     4 -1.67   8.94 How do changes in gut microbi…       0 C0            How …
##  6     5 -4.70   2.01 How does the gut microbiome c…       9 C9            How …
##  7     6 -0.786 -5.54 How does the gut microbiome c…       8 C8            How …
##  8     7  1.35   6.35 How does the gut microbiome c…      11 C11           How …
##  9     8 -2.18   3.36 How does the gut microbiome c…      10 C10           How …
## 10     9 -1.61   8.84 How does the gut microbiome c…       0 C0            How …
## # ℹ 9,736 more rows

Let’s pull out the questions, organized by cluster ID.

# Collect the question text of every cluster into a named list, keeping the
# clusters in the order in which their labels first appear in `dat`.
clust_names <- unique(dat$cluster_label)
questions <- lapply(clust_names, function(label) {
  # which() drops NA comparisons, so rows with a missing label are never
  # selected. Plain subsetting is used so that no column of `dat` can shadow
  # the `label` variable.
  dat$question[which(dat$cluster_label == label)]
})
names(questions) <- clust_names

# Number of questions per cluster. lengths() returns an integer vector even
# when `questions` is empty, unlike unlist(lapply(...)) which collapses to NULL.
q_len <- lengths(questions)
tmp <- tibble(cluster = clust_names, num_questions = q_len)

# Show every cluster (up to 20 rows) rather than the default truncated print.
print(tmp, n = 20)
## # A tibble: 20 × 2
##    cluster num_questions
##    <chr>           <int>
##  1 C11               910
##  2 C10               577
##  3 C14               793
##  4 C18               521
##  5 C0                835
##  6 C9                734
##  7 C8                562
##  8 C4                452
##  9 C3                138
## 10 C13               307
## 11 C7                384
## 12 C15               530
## 13 C5                650
## 14 C6                423
## 15 C16               415
## 16 C12               288
## 17 C17               324
## 18 C2                601
## 19 C19               225
## 20 C1                 77

Now we feed it into the chatbot.

# Load necessary libraries
library(httr)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
# Default API key for Chatbot(). The OPENROUTER_API_KEY environment variable is
# used when it is set, so the real key never has to be written into this file;
# otherwise the placeholder below is used and must be replaced by hand.
user_api_key <- Sys.getenv(
  "OPENROUTER_API_KEY",
  unset = "YOUR_OPENROUTER_API_KEY"
)

# Chatbot: send one prompt to an LLM through the OpenRouter chat-completions
# endpoint and return the text of the first reply.
#
# Arguments:
#   model_choice   Short alias of the model to use. Only "geminifl"
#                  (mapped to "google/gemini-2.5-flash-lite") is supported.
#   user_question  A single non-NA string, sent as the content of one "user"
#                  message.
#   api_key        OpenRouter API key; defaults to the global `user_api_key`.
#
# Returns the `content` field of the first choice in the response.
# Stops with an error on an unknown model alias, an invalid prompt or key, a
# non-200 HTTP status, or a response that carries no choices.
Chatbot <- function(model_choice = "geminifl",
                    user_question,
                    api_key = user_api_key) {
  # Map the model alias to the OpenRouter model identifier.
  supported_models <- c(geminifl = "google/gemini-2.5-flash-lite")
  model_ok <- is.character(model_choice) &&
    length(model_choice) == 1L &&
    model_choice %in% names(supported_models)
  if (!model_ok) {
    stop("Invalid model choice. Please choose geminifl.", call. = FALSE)
  }
  model <- supported_models[[model_choice]]

  # With auto_unbox = TRUE a longer vector would be serialised as a JSON array,
  # which is not a valid message content, so reject it before calling the API.
  question_ok <- is.character(user_question) &&
    length(user_question) == 1L &&
    !is.na(user_question)
  if (!question_ok) {
    stop("`user_question` must be a single non-NA string.", call. = FALSE)
  }

  key_ok <- is.character(api_key) &&
    length(api_key) == 1L &&
    !is.na(api_key) &&
    nzchar(api_key)
  if (!key_ok) {
    stop("`api_key` must be a single non-empty string.", call. = FALSE)
  }

  url <- "https://openrouter.ai/api/v1/chat/completions"

  # Build the request body: a single "user" message for the chosen model.
  payload <- list(
    model = model,
    messages = list(
      list(
        role = "user",
        content = user_question
      )
    )
  )
  payload_json <- jsonlite::toJSON(payload, auto_unbox = TRUE)

  # Send the POST request to the API.
  response <- httr::POST(
    url,
    httr::add_headers(
      Authorization = paste("Bearer", api_key),
      "Content-Type" = "application/json"
    ),
    body = payload_json
  )

  # Surface HTTP-level failures together with the raw response body.
  status <- httr::status_code(response)
  if (status != 200) {
    stop(
      "Error: Received status code ", status, "\n",
      httr::content(response, "text", encoding = "UTF-8"),
      call. = FALSE
    )
  }

  # Parse the body as JSON regardless of the declared content type.
  result <- httr::content(
    response,
    as = "parsed",
    type = "application/json",
    encoding = "UTF-8"
  )

  # A 200 response can still lack choices; fail with a readable message
  # instead of "subscript out of bounds".
  if (!is.list(result) || length(result$choices) == 0L) {
    detail <- if (is.list(result) && is.list(result$error)) {
      result$error$message
    } else {
      NULL
    }
    msg <- "Error: the API response contained no choices"
    if (is.character(detail) && length(detail) >= 1L) {
      msg <- paste0(msg, ": ", detail[[1]])
    }
    stop(msg, call. = FALSE)
  }

  # Return the chatbot's reply.
  result$choices[[1]]$message$content
}

# Quick check of Chatbot(): send one short prompt with the default model and
# print the reply.
example_prompt <- "What is better, ChatGPT or Claude? Please keep your answer no more than a few sentences."
answer <- Chatbot(user_question = example_prompt)
cat("Answer:", answer)
## Answer: Both ChatGPT and Claude are highly capable AI language models, but their strengths can differ. ChatGPT, especially its newer GPT-4 version, is generally known for its creativity, conversational abilities, and vast knowledge base. Claude, on the other hand, often excels in tasks requiring nuanced understanding, longer context windows, and a focus on safety and ethical AI principles. The "better" choice depends heavily on the specific task you're trying to accomplish.

Let’s test the prompt on a single cluster before we apply it to every cluster with lapply.

# Join every question of cluster C11 into one "; "-separated string.
test <- paste(questions[["C11"]], collapse = "; ")

And now we query.

# Insert the C11 question string into the prompt and send it; the reply is
# printed because the call is not assigned.
test_prompt <- sprintf(
  "Please give me an meta-question that encapsulates the main essence of all of the following questions, delimited by a semicolon. Please just say the question, with no extra fluff. Here are the questions: %s",
  test
)
Chatbot(user_question = test_prompt)
## [1] "How does the gut microbiome respond to and interact with dietary factors to influence host health and aging across diverse populations and conditions?"
# Prompt template; %s is replaced by one cluster's questions joined with "; ".
query <- "Please give me an meta-question that encapsulates the main essence of all of the following questions, delimited by a semicolon. Please just say the question, with no extra fluff. Here are the questions: %s"

# Ask the LLM for one meta-question per cluster. Each call is isolated: if a
# request fails or the reply is not a single string, that cluster gets NA and a
# warning, so the replies already obtained for other clusters are not lost.
meta_questions <- Map(
  function(label, cluster_questions) {
    string <- paste(cluster_questions, collapse = "; ")
    tryCatch(
      {
        result <- Chatbot(user_question = sprintf(query, string))
        if (!is.character(result) || length(result) != 1L) {
          stop("the reply was not a single string", call. = FALSE)
        }
        result
      },
      error = function(e) {
        warning(
          "No meta-question for cluster ", label, ": ", conditionMessage(e),
          call. = FALSE
        )
        NA_character_
      }
    )
  },
  names(questions),
  questions
)

# Every element is now exactly one string (or NA), so unlist() yields one
# value per cluster, in the same order as the cluster labels.
meta_questions_tib <- tibble(
  cluster_label = names(meta_questions),
  meta_question = unlist(meta_questions)
)

Now we join the metaquestions data frame with our original data frame, such that the metaquestions corresponding to each cluster can be copied accordingly. This will allow us to plot them.

# Copy each cluster's meta-question onto every row of `dat` that carries the
# same cluster_label; all rows of `dat` are kept.
final <- dat %>%
  dplyr::left_join(meta_questions_tib, by = "cluster_label")
final
## # A tibble: 9,746 × 8
##     ...1      x     y question         cluster cluster_label hover meta_question
##    <dbl>  <dbl> <dbl> <chr>              <dbl> <chr>         <chr> <chr>        
##  1     0  3.33   7.64 The prompt is: …      11 C11           The … How does the…
##  2     1 -3.78   4.65 How does the di…      10 C10           How … What factors…
##  3     2  7.70   6.74 How do early-li…      14 C14           How … How does the…
##  4     3 10.6   -4.98 How does the gu…      18 C18           How … How does var…
##  5     4 -1.67   8.94 How do changes …       0 C0            How … How does the…
##  6     5 -4.70   2.01 How does the gu…       9 C9            How … How does the…
##  7     6 -0.786 -5.54 How does the gu…       8 C8            How … How does the…
##  8     7  1.35   6.35 How does the gu…      11 C11           How … How does the…
##  9     8 -2.18   3.36 How does the gu…      10 C10           How … What factors…
## 10     9 -1.61   8.84 How does the gu…       0 C0            How … How does the…
## # ℹ 9,736 more rows

And now we output what we have:

# Output
# Render the cluster / meta-question table as an HTML table.
mq_html <- meta_questions_tib %>%
  knitr::kable(format = "html")

# Write the labelled data frame and the HTML table into the current working
# directory.
final %>%
  readr::write_csv("question_map_with_labeled_clusters.csv")
writeLines(text = mq_html, con = "meta_questions.html")