Trying some weird training nonsense. Maybe this will be fun.
All checks were successful
Docker Deploy / build-and-push (push) Successful in 3m23s

This commit is contained in:
2026-01-20 14:48:53 -07:00
parent 0081978489
commit 9694a42f3f
49 changed files with 190186 additions and 159 deletions

View File

@@ -28,5 +28,8 @@ COPY --from=build /go/bin/app /app/himbot
# Copy migrations directory
COPY --from=build /app/migrations /app/migrations
# Copy datasets directory
COPY --from=build /app/datasets /app/datasets
# Set the entrypoint
ENTRYPOINT ["/app/himbot"]

66
cmd/train/main.go Normal file
View File

@@ -0,0 +1,66 @@
package main
import (
"encoding/gob"
"flag"
"himbot/lib"
"log"
"os"
"path/filepath"
"strings"
)
// main trains a Markov chain from every .txt file found under the -input
// directory and writes the gob-encoded model to the -output path.
//
// Flags:
//
//	-input   directory walked recursively for files ending in ".txt"
//	-output  destination path of the gob-encoded model
//	-order   Markov chain order (words per key); must be at least 1
//
// A file that cannot be read is logged and skipped. Any other failure
// (walk error, invalid order, create/encode/close error) aborts the run.
func main() {
	inputDir := flag.String("input", "datasets/bard", "Directory containing text files to train on")
	outputFile := flag.String("output", "datasets/bard.gob", "Output file path for the pre-trained model")
	order := flag.Int("order", 3, "Markov chain order (N-gram size)")
	flag.Parse()

	// The chain builder slices the word stream with this value, so a zero or
	// negative order is rejected before any work is done.
	if *order < 1 {
		log.Fatalf("Invalid order %d: must be at least 1", *order)
	}

	log.Printf("Scanning directory: %s", *inputDir)

	var allLines []string
	fileCount := 0

	err := filepath.Walk(*inputDir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() || !strings.HasSuffix(info.Name(), ".txt") {
			return nil
		}

		content, err := os.ReadFile(path)
		if err != nil {
			log.Printf("Error reading file %s: %v", path, err)
			return nil // Continue to next file
		}

		allLines = append(allLines, strings.Split(string(content), "\n")...)
		fileCount++
		if fileCount%5 == 0 {
			log.Printf("Processed %d files...", fileCount)
		}
		return nil
	})
	if err != nil {
		log.Fatalf("Error walking directory: %v", err)
	}

	log.Printf("Found %d files with %d total lines. Building Markov chain...", fileCount, len(allLines))
	chain := lib.BuildMarkovChain(allLines, *order)
	log.Printf("Chain built with %d start keys. Saving to %s...", len(chain.Starts), *outputFile)

	f, err := os.Create(*outputFile)
	if err != nil {
		log.Fatalf("Failed to create output file: %v", err)
	}

	// The file is closed explicitly rather than with defer: log.Fatalf exits
	// the process without running deferred calls, and a failed Close on a
	// written file means the model may be incomplete on disk.
	if err := gob.NewEncoder(f).Encode(chain); err != nil {
		_ = f.Close() // best effort; the encode error is the one worth reporting
		log.Fatalf("Failed to encode chain: %v", err)
	}
	if err := f.Close(); err != nil {
		log.Fatalf("Failed to close output file: %v", err)
	}

	log.Println("Done!")
}

View File

@@ -2,73 +2,69 @@ package command
import (
"crypto/md5"
"encoding/gob"
"fmt"
"himbot/lib"
"math/rand"
"regexp"
"os"
"strings"
"sync"
"time"
"github.com/bwmarrin/discordgo"
)
// MarkovData is a word-level Markov model whose keys are pairs of
// consecutive words joined by a single space.
type MarkovData struct {
// Chain maps a two-word key to every word observed directly after that
// pair; repeated observations are kept as repeated entries.
Chain map[string][]string // "word1 word2" -> ["word3", ...]
// Starts holds the first two-word key of each message that contributed at
// least three words to the chain.
Starts []string
}
type MarkovCache struct {
data map[string]*MarkovData
data map[string]*lib.MarkovData
hashes map[string]string
mu sync.RWMutex
}
var (
markovCache = &MarkovCache{
data: make(map[string]*MarkovData),
data: make(map[string]*lib.MarkovData),
hashes: make(map[string]string),
}
urlRegex = regexp.MustCompile(`https?://[^\s]+`)
mentionRegex = regexp.MustCompile(`<[@#&!][^>]+>`)
bardChain *lib.MarkovData
)
func MarkovCommand(s *discordgo.Session, i *discordgo.InteractionCreate) (string, error) {
channelID := i.ChannelID
numMessages := lib.AppConfig.MarkovDefaultMessages
if len(i.ApplicationCommandData().Options) > 0 {
if i.ApplicationCommandData().Options[0].Name == "messages" {
numMessages = int(i.ApplicationCommandData().Options[0].IntValue())
if numMessages <= 0 {
numMessages = lib.AppConfig.MarkovDefaultMessages
} else if numMessages > lib.AppConfig.MarkovMaxMessages {
numMessages = lib.AppConfig.MarkovMaxMessages
}
}
}
cacheKey := fmt.Sprintf("%s:%d", channelID, numMessages)
if data := getCachedChain(cacheKey); data != nil {
if msg := generateMessage(data, ""); msg != "" {
return msg, nil
}
}
allMessages, err := fetchMessages(s, channelID, numMessages)
func InitBard(modelPath string) error {
f, err := os.Open(modelPath)
if err != nil {
return "", err
return err
}
defer f.Close()
var data lib.MarkovData
decoder := gob.NewDecoder(f)
if err := decoder.Decode(&data); err != nil {
return err
}
data := buildMarkovChain(allMessages)
setCachedChain(cacheKey, data, allMessages)
bardChain = &data
return nil
}
newMessage := generateMessage(data, "")
if newMessage == "" {
newMessage = "Not enough text data to generate a message."
func BardCommand(s *discordgo.Session, i *discordgo.InteractionCreate) (string, error) {
if bardChain == nil {
return "The bard is sleeping (dataset not loaded).", nil
}
return newMessage, nil
var question string
for _, option := range i.ApplicationCommandData().Options {
if option.Name == "question" {
question = option.StringValue()
}
}
answer := lib.GenerateMessage(bardChain, question)
if answer == "" {
answer = "Words fail me."
}
if question != "" {
return fmt.Sprintf("**Q:** %s\n**A:** %s", question, answer), nil
}
return answer, nil
}
func MarkovQuestionCommand(s *discordgo.Session, i *discordgo.InteractionCreate) (string, error) {
@@ -95,7 +91,7 @@ func MarkovQuestionCommand(s *discordgo.Session, i *discordgo.InteractionCreate)
}
cacheKey := fmt.Sprintf("%s:%d", channelID, numMessages)
var data *MarkovData
var data *lib.MarkovData
if cachedData := getCachedChain(cacheKey); cachedData != nil {
data = cachedData
@@ -104,11 +100,18 @@ func MarkovQuestionCommand(s *discordgo.Session, i *discordgo.InteractionCreate)
if err != nil {
return "", err
}
data = buildMarkovChain(allMessages)
setCachedChain(cacheKey, data, allMessages)
var texts []string
for _, msg := range allMessages {
texts = append(texts, msg.Content)
}
answer := generateMessage(data, question)
// Use order 2 for chat history (sparse data)
data = lib.BuildMarkovChain(texts, 2)
setCachedChain(cacheKey, data, hashMessages(allMessages))
}
answer := lib.GenerateMessage(data, question)
if answer == "" {
answer = "I don't have enough context to answer that."
}
@@ -116,15 +119,13 @@ func MarkovQuestionCommand(s *discordgo.Session, i *discordgo.InteractionCreate)
return fmt.Sprintf("**Q:** %s\n**A:** %s", question, answer), nil
}
func getCachedChain(cacheKey string) *MarkovData {
func getCachedChain(cacheKey string) *lib.MarkovData {
markovCache.mu.RLock()
defer markovCache.mu.RUnlock()
return markovCache.data[cacheKey]
}
func setCachedChain(cacheKey string, data *MarkovData, messages []*discordgo.Message) {
hash := hashMessages(messages)
func setCachedChain(cacheKey string, data *lib.MarkovData, hash string) {
markovCache.mu.Lock()
defer markovCache.mu.Unlock()
@@ -186,106 +187,3 @@ func fetchMessages(s *discordgo.Session, channelID string, numMessages int) ([]*
return allMessages, nil
}
// cleanText deletes everything matched by urlRegex and mentionRegex from text
// and returns the remaining words joined by single spaces.
func cleanText(text string) string {
	withoutURLs := urlRegex.ReplaceAllString(text, "")
	withoutMentions := mentionRegex.ReplaceAllString(withoutURLs, "")
	words := strings.Fields(withoutMentions)
	return strings.Join(words, " ")
}
// buildMarkovChain builds a two-word-key Markov model from message contents.
//
// Each message is cleaned with cleanText and split on whitespace. Messages
// that yield fewer than three words are ignored. For every remaining message
// the first two words are recorded as a start key, and each consecutive word
// pair is mapped to the word that follows it.
func buildMarkovChain(messages []*discordgo.Message) *MarkovData {
	model := &MarkovData{
		Chain:  map[string][]string{},
		Starts: []string{},
	}

	for _, m := range messages {
		tokens := strings.Fields(cleanText(m.Content))
		if len(tokens) < 3 {
			continue
		}

		model.Starts = append(model.Starts, key(tokens[0], tokens[1]))

		// next indexes the successor word; the two words before it form the key.
		for next := 2; next < len(tokens); next++ {
			prefix := key(tokens[next-2], tokens[next-1])
			model.Chain[prefix] = append(model.Chain[prefix], tokens[next])
		}
	}

	return model
}
// generateMessage walks the chain in data and returns the generated words
// joined by spaces, or "" when data has no start keys.
//
// When seed is non-empty, every chain key that contains (case-insensitively)
// a cleaned seed word longer than three bytes is a candidate starting key and
// one candidate is chosen at random. Without a candidate, a random start key
// is used. Up to 40 words are appended; once more than six words have been
// appended, a word containing '.', '!' or '?' ends the walk whenever
// rand.Float32() exceeds 0.3.
func generateMessage(data *MarkovData, seed string) string {
	if len(data.Starts) == 0 {
		return ""
	}

	state := ""

	// Try to seed based on input question
	if seed != "" {
		needles := strings.Fields(cleanText(seed))
		var matches []string
		for chainKey := range data.Chain {
			lowered := strings.ToLower(chainKey)
			for _, needle := range needles {
				if len(needle) <= 3 {
					continue
				}
				if strings.Contains(lowered, strings.ToLower(needle)) {
					matches = append(matches, chainKey)
				}
			}
		}
		if len(matches) > 0 {
			state = matches[rand.Intn(len(matches))]
		}
	}

	if state == "" {
		state = data.Starts[rand.Intn(len(data.Starts))]
	}

	pair := strings.Split(state, " ")
	prev, cur := pair[0], pair[1]
	words := []string{prev, cur}

	for step := 0; step < 40; step++ {
		options := data.Chain[state]
		if len(options) == 0 {
			break
		}

		picked := options[rand.Intn(len(options))]
		words = append(words, picked)

		prev, cur = cur, picked
		state = key(prev, cur)

		// Soft stop on punctuation
		if step > 5 && strings.ContainsAny(picked, ".!?") && rand.Float32() > 0.3 {
			break
		}
	}

	return strings.Join(words, " ")
}
// key builds the chain key for a word pair: the two words separated by one
// space.
func key(w1, w2 string) string {
	const sep = " "
	joined := w1 + sep
	joined += w2
	return joined
}
// init seeds math/rand's global source with the current time in nanoseconds.
// NOTE(review): rand.Seed is deprecated since Go 1.20, where the global source
// is seeded randomly at startup; confirm the module's Go version before
// relying on this call.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
}

BIN
datasets/bard.gob Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

7063
datasets/bard/poems.txt Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,102 @@
The Phoenix and Turtle
by William Shakespeare
Edited by Barbara A. Mowat and Paul Werstine
with Michael Poston and Rebecca Niles
Folger Shakespeare Library
https://shakespeare.folger.edu/shakespeares-works/the-phoenix-and-turtle/
Created on Jul 31, 2015, from FDT version 0.9.0.1
"The Phoenix and Turtle"
Let the bird of loudest lay
On the sole Arabian tree
Herald sad and trumpet be,
To whose sound chaste wings obey.
But thou shrieking harbinger,
Foul precurrer of the fiend,
Augur of the fever's end,
To this troop come thou not near.
From this session interdict
Every fowl of tyrant wing,
Save the eagle, feathered king;
Keep the obsequy so strict.
Let the priest in surplice white,
That defunctive music can,
Be the death-divining swan,
Lest the requiem lack his right.
And thou treble-dated crow,
That thy sable gender mak'st
With the breath thou giv'st and tak'st,
'Mongst our mourners shalt thou go.
Here the anthem doth commence:
Love and constancy is dead,
Phoenix and the turtle fled
In a mutual flame from hence.
So they loved, as love in twain
Had the essence but in one,
Two distincts, division none;
Number there in love was slain.
Hearts remote yet not asunder,
Distance and no space was seen
'Twixt this turtle and his queen;
But in them it were a wonder.
So between them love did shine
That the turtle saw his right
Flaming in the phoenix' sight;
Either was the other's mine.
Property was thus appalled
That the self was not the same;
Single nature's double name
Neither two nor one was called.
Reason, in itself confounded,
Saw division grow together,
To themselves yet either neither,
Simple were so well compounded
That it cried, "How true a twain
Seemeth this concordant one!
Love hath reason, Reason none,
If what parts can so remain,"
Whereupon it made this threne
To the phoenix and the dove,
Co-supremes and stars of love,
As chorus to their tragic scene.
Threnos
Beauty, truth, and rarity,
Grace in all simplicity,
Here enclosed, in cinders lie.
Death is now the phoenix' nest,
And the turtle's loyal breast
To eternity doth rest,
Leaving no posterity;
'Twas not their infirmity,
It was married chastity.
Truth may seem, but cannot be;
Beauty brag, but 'tis not she;
Truth and beauty buried be.
To this urn let those repair
That are either true or fair;
For these dead birds sigh a prayer.
William Shakespeare

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

262
lib/markov.go Normal file
View File

@@ -0,0 +1,262 @@
package lib
import (
"math/rand"
"regexp"
"strings"
"time"
)
// MarkovData is a word-level Markov model of configurable order.
type MarkovData struct {
// Order is the number of words that make up each key in Chain and Starts.
Order int
// Chain maps a key of Order space-joined words to every word observed
// directly after that sequence; repeated observations are kept as repeated
// entries.
Chain map[string][]string // "word1 ... wordN" -> ["wordN+1", ...]
// Starts lists keys a generated message may begin with: the first key of
// the corpus plus keys that follow a word containing '.', '!' or '?'.
Starts []string
}
var (
// urlRegex matches "http://" or "https://" followed by a run of
// non-whitespace characters.
urlRegex = regexp.MustCompile(`https?://[^\s]+`)
// mentionRegex matches "<", one of @ # & !, then one or more characters up
// to the next ">".
// NOTE(review): presumably Discord mention/channel markup — confirm.
mentionRegex = regexp.MustCompile(`<[@#&!][^>]+>`)
// bracketRegex matches the shortest "[...]" span that does not cross a
// newline.
bracketRegex = regexp.MustCompile(`\[.*?\]`)
// speakerRegex matches, at the very start of the text, one or more words of
// two or more ASCII capital letters, each followed by whitespace.
// NOTE(review): presumably speaker labels in play scripts — confirm.
speakerRegex = regexp.MustCompile(`^(?:[A-Z]{2,}\s+)+`)
// stopWords is a set of lower-case words that GenerateMessage ranks behind
// all other seed words when choosing a starting key.
stopWords = map[string]bool{
"the": true, "and": true, "a": true, "to": true, "of": true,
"in": true, "is": true, "that": true, "it": true, "for": true,
"as": true, "with": true, "on": true, "at": true, "by": true,
"this": true, "from": true, "but": true, "or": true, "an": true,
"be": true, "are": true, "was": true, "were": true, "so": true,
"if": true, "out": true, "up": true, "about": true, "into": true,
"over": true, "after": true, "beneath": true, "under": true,
"above": true, "me": true, "my": true, "mine": true, "you": true,
"your": true, "yours": true, "he": true, "him": true, "his": true,
"she": true, "her": true, "hers": true, "they": true, "them": true,
"their": true, "theirs": true, "we": true, "us": true, "our": true,
"ours": true, "who": true, "whom": true, "whose": true, "what": true,
"which": true, "when": true, "where": true, "why": true, "how": true,
"give": true, "write": true, "tell": true, "say": true, "speak": true,
"make": true, "do": true, "does": true, "did": true, "done": true,
}
)
// init seeds math/rand's global source with the current time in nanoseconds.
// NOTE(review): rand.Seed is deprecated since Go 1.20, where the global source
// is seeded randomly at startup; confirm the module's Go version before
// relying on this call.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
}
// CleanText strips URLs, mention markup and bracketed spans from text, trims
// surrounding whitespace, removes a leading run matched by speakerRegex, and
// returns the remaining words joined by single spaces.
func CleanText(text string) string {
	withoutURLs := urlRegex.ReplaceAllString(text, "")
	withoutMentions := mentionRegex.ReplaceAllString(withoutURLs, "")
	withoutBrackets := bracketRegex.ReplaceAllString(withoutMentions, "")

	// speakerRegex is anchored at the start of the string, so leading
	// whitespace has to go before it is applied.
	body := speakerRegex.ReplaceAllString(strings.TrimSpace(withoutBrackets), "")

	words := strings.Fields(body)
	return strings.Join(words, " ")
}
// BuildMarkovChain builds an order-N word-level Markov model from lines.
//
// A line that contains at least one cased letter and no lower-case letters is
// treated as a header and skipped. Every other line is passed through
// CleanText, split on whitespace and appended to one continuous word stream,
// so transitions are also recorded across line boundaries.
//
// Parameters:
//   - lines: raw input text, one entry per line or message.
//   - order: number of words per key; values below 1 yield an empty model.
//
// The returned model is never nil. It has no keys and no starts when order is
// below 1 or when fewer than order+1 words survive cleaning.
func BuildMarkovChain(lines []string, order int) *MarkovData {
	data := &MarkovData{
		Order:  order,
		Chain:  make(map[string][]string),
		Starts: make([]string, 0),
	}

	// A negative order would panic in the slice expressions below, and a zero
	// order would key every word under the empty string.
	if order < 1 {
		return data
	}

	var allWords []string
	for _, line := range lines {
		// Skip likely headers/metadata (all caps lines)
		trimmed := strings.TrimSpace(line)
		if trimmed != "" && strings.ToUpper(trimmed) == trimmed && strings.ToLower(trimmed) != trimmed {
			continue
		}
		// A line that cleans to "" contributes no fields.
		allWords = append(allWords, strings.Fields(CleanText(line))...)
	}

	if len(allWords) < order+1 {
		return data
	}

	// First key is always a start
	data.Starts = append(data.Starts, Key(allWords[:order]...))

	// Windows beginning at index last or beyond have no successor word.
	last := len(allWords) - order
	for i := 0; i < last; i++ {
		k := Key(allWords[i : i+order]...)
		data.Chain[k] = append(data.Chain[k], allWords[i+order])

		// If the word shifting out ends a sentence, the next window is a
		// start. The final window is excluded because this position records
		// no successor for it, so a message begun there could stop at once.
		if i+1 < last && strings.ContainsAny(allWords[i], ".!?") {
			data.Starts = append(data.Starts, Key(allWords[i+1:i+1+order]...))
		}
	}

	return data
}
// GenerateMessage produces a pseudo-random message by walking the chain in
// data and returns the words joined by single spaces.
//
// Parameters:
//   - data: the model to walk; nil or a model without start keys yields "".
//   - seed: optional text (for example a question) used to bias the starting
//     key. Seed words are lower-cased and stripped of surrounding punctuation,
//     so "Love?" matches a key word "love,".
//
// Starting key selection, first match wins:
//  1. a start key whose first word equals the best-ranked matching seed word
//  2. a start key containing a seed word in a later position
//  3. any chain key whose first word equals a seed word
//  4. a random start key
//
// Up to 40 words are appended to the starting key. Once more than six words
// have been appended, a word containing '.', '!' or '?' ends the message
// whenever rand.Float32() exceeds 0.3.
func GenerateMessage(data *MarkovData, seed string) string {
	if data == nil || len(data.Starts) == 0 {
		return ""
	}

	currentKey := ""
	// Try to seed based on input question
	if seed != "" {
		currentKey = pickSeedKey(data, rankSeedWords(seed))
	}
	if currentKey == "" {
		currentKey = data.Starts[rand.Intn(len(data.Starts))]
	}

	output := strings.Fields(currentKey)
	for i := 0; i < 40; i++ {
		nextOptions := data.Chain[currentKey]
		if len(nextOptions) == 0 {
			break
		}

		nextWord := nextOptions[rand.Intn(len(nextOptions))]
		output = append(output, nextWord)

		// Shift the key window: drop the oldest word, add the new one.
		currentWords := strings.Fields(currentKey)
		if len(currentWords) == 0 {
			break
		}
		currentKey = Key(append(currentWords[1:], nextWord)...)

		// Soft stop on punctuation
		if i > 5 && strings.ContainsAny(nextWord, ".!?") && rand.Float32() > 0.3 {
			break
		}
	}

	return strings.Join(output, " ")
}

// seedPunctuation lists the characters stripped from both ends of a word
// before seed words and key words are compared.
const seedPunctuation = ".,;:!?\"'()"

// normalizeToken lower-cases w and strips surrounding punctuation.
func normalizeToken(w string) string {
	return strings.Trim(strings.ToLower(w), seedPunctuation)
}

// rankSeedWords cleans seed, normalizes each word, drops words of two bytes or
// fewer, and orders the rest: non-stop-words first, longer words first within
// each group.
func rankSeedWords(seed string) []string {
	var words []string
	for _, w := range strings.Fields(CleanText(seed)) {
		if w = normalizeToken(w); len(w) > 2 {
			words = append(words, w)
		}
	}

	// Exchange sort; seed text is a handful of words at most.
	for i := 0; i < len(words); i++ {
		for j := i + 1; j < len(words); j++ {
			if seedBefore(words[j], words[i]) {
				words[i], words[j] = words[j], words[i]
			}
		}
	}
	return words
}

// seedBefore reports whether seed word a should be tried before seed word b.
func seedBefore(a, b string) bool {
	if stopWords[a] != stopWords[b] {
		return !stopWords[a]
	}
	return len(a) > len(b)
}

// keyParts splits key into normalized words, or returns nil when the key is
// empty or has fewer than order words.
func keyParts(key string, order int) []string {
	parts := strings.Fields(strings.ToLower(key))
	if len(parts) == 0 || len(parts) < order {
		return nil
	}
	for i, p := range parts {
		parts[i] = strings.Trim(p, seedPunctuation)
	}
	return parts
}

// pickSeedKey returns a random key matching the best-ranked seed word that
// matches anything, or "" when no seed word matches any key.
func pickSeedKey(data *MarkovData, seedWords []string) string {
	var candidates []string

	// 1. Sentence starters that begin with the seed word.
	for _, sw := range seedWords {
		for _, startKey := range data.Starts {
			if parts := keyParts(startKey, data.Order); parts != nil && parts[0] == sw {
				candidates = append(candidates, startKey)
			}
		}
		if len(candidates) > 0 {
			break
		}
	}

	// 2. Sentence starters containing the seed word after the first position.
	if len(candidates) == 0 {
		for _, sw := range seedWords {
			for _, startKey := range data.Starts {
				parts := keyParts(startKey, data.Order)
				if parts == nil {
					continue
				}
				for _, p := range parts[1:] {
					if p == sw {
						candidates = append(candidates, startKey)
						break
					}
				}
			}
			if len(candidates) > 0 {
				break
			}
		}
	}

	// 3. Any chain key that begins with the seed word.
	if len(candidates) == 0 {
		for _, sw := range seedWords {
			for k := range data.Chain {
				if parts := keyParts(k, data.Order); parts != nil && parts[0] == sw {
					candidates = append(candidates, k)
				}
			}
			if len(candidates) > 0 {
				break
			}
		}
	}

	if len(candidates) == 0 {
		return ""
	}
	return candidates[rand.Intn(len(candidates))]
}
// Key builds a chain key from words by joining them with single spaces. It
// returns "" when called without arguments.
func Key(words ...string) string {
	const sep = " "
	return strings.Join(words, sep)
}

16
main.go
View File

@@ -29,6 +29,10 @@ func main() {
initCommands(config)
initCommandHandlers(config)
if err := command.InitBard("datasets/bard.gob"); err != nil {
log.Printf("Failed to load Bard dataset: %v", err)
}
err := lib.InitDB()
if err != nil {
log.Fatalf("Failed to initialize database: %v", err)
@@ -191,13 +195,13 @@ func initCommands(config *lib.Config) {
},
},
{
Name: "gen",
Description: "Generate a random message using markov chains based on channel history",
Name: "bard",
Description: "Ask the bard a question",
Options: []*discordgo.ApplicationCommandOption{
{
Type: discordgo.ApplicationCommandOptionInteger,
Name: "messages",
Description: fmt.Sprintf("Number of messages to use (default: %d, max: %d)", config.MarkovDefaultMessages, config.MarkovMaxMessages),
Type: discordgo.ApplicationCommandOptionString,
Name: "question",
Description: "The question you want to ask",
Required: false,
},
},
@@ -255,7 +259,7 @@ func initCommandHandlers(config *lib.Config) {
commandHandlers = map[string]func(s *discordgo.Session, i *discordgo.InteractionCreate){
"ping": lib.HandleCommand("ping", time.Duration(config.PingCooldown)*time.Second, command.PingCommand),
"hs": lib.HandleCommand("hs", time.Duration(config.HsCooldown)*time.Second, command.HsCommand),
"gen": lib.HandleCommand("gen", time.Duration(config.MarkovCooldown)*time.Second, command.MarkovCommand),
"bard": lib.HandleCommand("bard", time.Duration(config.MarkovCooldown)*time.Second, command.BardCommand),
"ask": lib.HandleCommand("ask", time.Duration(config.MarkovAskCooldown)*time.Second, command.MarkovQuestionCommand),
"himbucks": lib.HandleCommand("himbucks", time.Duration(config.HimbucksCooldown)*time.Second, command.BalanceGetCommand),
"himboard": lib.HandleCommand("himboard", time.Duration(config.HimboardCooldown)*time.Second, command.LeaderboardCommand),