Trying some weird training nonsense. Maybe this will be fun.

2026-01-20 14:48:53 -07:00
parent 0081978489
commit 9694a42f3f
49 changed files with 190186 additions and 159 deletions
@@ -28,5 +28,8 @@ COPY --from=build /go/bin/app /app/himbot
 # Copy migrations directory
 COPY --from=build /app/migrations /app/migrations

+# Copy datasets directory
+COPY --from=build /app/datasets /app/datasets
+
 # Set the entrypoint
 ENTRYPOINT ["/app/himbot"]
@@ -0,0 +1,66 @@
+package main
+
+import (
+	"encoding/gob"
+	"flag"
+	"himbot/lib"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// main trains a Markov chain on every .txt file found under the -input
+// directory and writes it, gob-encoded, to the -output path.
+//
+// Flags:
+//
+//	-input   directory that is walked recursively for .txt files
+//	-output  destination file for the encoded chain
+//	-order   number of words per chain key; must be at least 1
+//
+// Files that cannot be read are logged and skipped. Every other failure
+// terminates the program through log.Fatalf.
+func main() {
+	inputDir := flag.String("input", "datasets/bard", "Directory containing text files to train on")
+	outputFile := flag.String("output", "datasets/bard.gob", "Output file path for the pre-trained model")
+	order := flag.Int("order", 3, "Markov chain order (N-gram size)")
+	flag.Parse()
+
+	// A key needs at least one word; reject bad values before doing any work
+	// instead of letting the chain builder slice with a non-positive bound.
+	if *order < 1 {
+		log.Fatalf("Invalid order %d: must be at least 1", *order)
+	}
+
+	log.Printf("Scanning directory: %s", *inputDir)
+
+	var allLines []string
+	fileCount := 0
+
+	err := filepath.Walk(*inputDir, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if info.IsDir() || !strings.HasSuffix(info.Name(), ".txt") {
+			return nil
+		}
+
+		content, err := os.ReadFile(path)
+		if err != nil {
+			// One unreadable file should not abort the whole training run.
+			log.Printf("Error reading file %s: %v", path, err)
+			return nil
+		}
+
+		allLines = append(allLines, strings.Split(string(content), "\n")...)
+		fileCount++
+		if fileCount%5 == 0 {
+			log.Printf("Processed %d files...", fileCount)
+		}
+		return nil
+	})
+	if err != nil {
+		log.Fatalf("Error walking directory: %v", err)
+	}
+
+	log.Printf("Found %d files with %d total lines. Building Markov chain...", fileCount, len(allLines))
+
+	chain := lib.BuildMarkovChain(allLines, *order)
+
+	log.Printf("Chain built with %d start keys. Saving to %s...", len(chain.Starts), *outputFile)
+
+	f, err := os.Create(*outputFile)
+	if err != nil {
+		log.Fatalf("Failed to create output file: %v", err)
+	}
+
+	// The file is closed explicitly rather than with defer: log.Fatalf exits
+	// the process without running deferred calls, and a Close error on a
+	// freshly written file means the model on disk may be incomplete.
+	if err := gob.NewEncoder(f).Encode(chain); err != nil {
+		f.Close() // best effort; the encode error is the one worth reporting
+		log.Fatalf("Failed to encode chain: %v", err)
+	}
+	if err := f.Close(); err != nil {
+		log.Fatalf("Failed to close output file: %v", err)
+	}
+
+	log.Println("Done!")
+}
@@ -2,73 +2,69 @@ package command

 import (
 	"crypto/md5"
+	"encoding/gob"
 	"fmt"
 	"himbot/lib"
-	"math/rand"
-	"regexp"
+	"os"
 	"strings"
 	"sync"
-	"time"

 	"github.com/bwmarrin/discordgo"
 )

-type MarkovData struct {
-	Chain  map[string][]string // "word1 word2" -> ["word3", ...]
-	Starts []string
-}
-
+// MarkovCache is an in-memory store of Markov chains addressed by a string
+// cache key. data holds the chains; hashes holds one string per cache key
+// (presumably the hash of the messages the chain was built from; the code
+// that fills it is not fully visible here, so confirm). mu guards the maps:
+// getCachedChain reads under the read lock and setCachedChain takes the
+// write lock.
 type MarkovCache struct {
-	data   map[string]*MarkovData
+	data   map[string]*lib.MarkovData
 	hashes map[string]string
 	mu     sync.RWMutex
 }

+// Package-level state:
+//   - markovCache is the shared chain cache, created with empty maps.
+//   - bardChain is the pre-trained chain installed by InitBard; it stays nil
+//     until a load succeeds, and BardCommand checks for that.
 var (
 	markovCache = &MarkovCache{
-		data:   make(map[string]*MarkovData),
+		data:   make(map[string]*lib.MarkovData),
 		hashes: make(map[string]string),
 	}
-	urlRegex     = regexp.MustCompile(`https?://[^\s]+`)
-	mentionRegex = regexp.MustCompile(`<[@#&!][^>]+>`)
+	bardChain *lib.MarkovData
 )

-func MarkovCommand(s *discordgo.Session, i *discordgo.InteractionCreate) (string, error) {
-	channelID := i.ChannelID
-
-	numMessages := lib.AppConfig.MarkovDefaultMessages
-	if len(i.ApplicationCommandData().Options) > 0 {
-		if i.ApplicationCommandData().Options[0].Name == "messages" {
-			numMessages = int(i.ApplicationCommandData().Options[0].IntValue())
-			if numMessages <= 0 {
-				numMessages = lib.AppConfig.MarkovDefaultMessages
-			} else if numMessages > lib.AppConfig.MarkovMaxMessages {
-				numMessages = lib.AppConfig.MarkovMaxMessages
-			}
-		}
-	}
-
-	cacheKey := fmt.Sprintf("%s:%d", channelID, numMessages)
-	if data := getCachedChain(cacheKey); data != nil {
-		if msg := generateMessage(data, ""); msg != "" {
-			return msg, nil
-		}
-	}
-
-	allMessages, err := fetchMessages(s, channelID, numMessages)
+// InitBard opens the file at modelPath, gob-decodes a lib.MarkovData from it
+// and installs the result as bardChain, the chain BardCommand generates from.
+//
+// It returns the error from opening or decoding unchanged. bardChain is only
+// assigned after a successful decode, so a failed call leaves it untouched.
+func InitBard(modelPath string) error {
+	f, err := os.Open(modelPath)
 	if err != nil {
-		return "", err
+		return err
+	}
+	defer f.Close()
+
+	var data lib.MarkovData
+	decoder := gob.NewDecoder(f)
+	if err := decoder.Decode(&data); err != nil {
+		return err
 	}

-	data := buildMarkovChain(allMessages)
-	setCachedChain(cacheKey, data, allMessages)
+	bardChain = &data
+	return nil
+}

-	newMessage := generateMessage(data, "")
-	if newMessage == "" {
-		newMessage = "Not enough text data to generate a message."
+// BardCommand answers an interaction with text generated from bardChain.
+//
+// If no chain has been loaded it returns a fixed notice instead. Otherwise it
+// reads the optional "question" option, passes it to lib.GenerateMessage as
+// the seed, and substitutes "Words fail me." when generation yields nothing.
+// With a question the reply is formatted as a Q/A pair; without one the
+// generated text is returned as is. The error result is always nil.
+func BardCommand(s *discordgo.Session, i *discordgo.InteractionCreate) (string, error) {
+	if bardChain == nil {
+		return "The bard is sleeping (dataset not loaded).", nil
 	}

-	return newMessage, nil
+	var question string
+	for _, option := range i.ApplicationCommandData().Options {
+		if option.Name == "question" {
+			question = option.StringValue()
+		}
+	}
+
+	answer := lib.GenerateMessage(bardChain, question)
+	if answer == "" {
+		answer = "Words fail me."
+	}
+
+	if question != "" {
+		return fmt.Sprintf("**Q:** %s\n**A:** %s", question, answer), nil
+	}
+
+	return answer, nil
 }

 func MarkovQuestionCommand(s *discordgo.Session, i *discordgo.InteractionCreate) (string, error) {
@@ -95,7 +91,7 @@ func MarkovQuestionCommand(s *discordgo.Session, i *discordgo.InteractionCreate)
 	}

 	cacheKey := fmt.Sprintf("%s:%d", channelID, numMessages)
-	var data *MarkovData
+	var data *lib.MarkovData

 	if cachedData := getCachedChain(cacheKey); cachedData != nil {
 		data = cachedData
@@ -104,11 +100,18 @@ func MarkovQuestionCommand(s *discordgo.Session, i *discordgo.InteractionCreate)
 		if err != nil {
 			return "", err
 		}
-		data = buildMarkovChain(allMessages)
-		setCachedChain(cacheKey, data, allMessages)
+
+		var texts []string
+		for _, msg := range allMessages {
+			texts = append(texts, msg.Content)
+		}
+
+		// Use order 2 for chat history (sparse data)
+		data = lib.BuildMarkovChain(texts, 2)
+		setCachedChain(cacheKey, data, hashMessages(allMessages))
 	}

-	answer := generateMessage(data, question)
+	answer := lib.GenerateMessage(data, question)
 	if answer == "" {
 		answer = "I don't have enough context to answer that."
 	}
@@ -116,15 +119,13 @@ func MarkovQuestionCommand(s *discordgo.Session, i *discordgo.InteractionCreate)
 	return fmt.Sprintf("**Q:** %s\n**A:** %s", question, answer), nil
 }

-func getCachedChain(cacheKey string) *MarkovData {
+// getCachedChain returns the chain stored in markovCache under cacheKey, or
+// nil when nothing is stored for that key. The lookup runs under the cache's
+// read lock.
+func getCachedChain(cacheKey string) *lib.MarkovData {
 	markovCache.mu.RLock()
 	defer markovCache.mu.RUnlock()
 	return markovCache.data[cacheKey]
 }

-func setCachedChain(cacheKey string, data *MarkovData, messages []*discordgo.Message) {
-	hash := hashMessages(messages)
-
+func setCachedChain(cacheKey string, data *lib.MarkovData, hash string) {
 	markovCache.mu.Lock()
 	defer markovCache.mu.Unlock()

@@ -186,106 +187,3 @@ func fetchMessages(s *discordgo.Session, channelID string, numMessages int) ([]*

 	return allMessages, nil
 }
-
-func cleanText(text string) string {
-	text = urlRegex.ReplaceAllString(text, "")
-	text = mentionRegex.ReplaceAllString(text, "")
-	return strings.Join(strings.Fields(text), " ")
-}
-
-func buildMarkovChain(messages []*discordgo.Message) *MarkovData {
-	data := &MarkovData{
-		Chain:  make(map[string][]string),
-		Starts: make([]string, 0),
-	}
-
-	for _, msg := range messages {
-		cleaned := cleanText(msg.Content)
-		if cleaned == "" {
-			continue
-		}
-
-		words := strings.Fields(cleaned)
-		if len(words) < 3 {
-			continue
-		}
-
-		startKey := key(words[0], words[1])
-		data.Starts = append(data.Starts, startKey)
-
-		for i := 0; i < len(words)-2; i++ {
-			k := key(words[i], words[i+1])
-			val := words[i+2]
-			data.Chain[k] = append(data.Chain[k], val)
-		}
-	}
-
-	return data
-}
-
-func generateMessage(data *MarkovData, seed string) string {
-	if len(data.Starts) == 0 {
-		return ""
-	}
-
-	var w1, w2 string
-	var currentKey string
-
-	// Try to seed based on input question
-	if seed != "" {
-		seedWords := strings.Fields(cleanText(seed))
-		var candidates []string
-
-		for k := range data.Chain {
-			for _, sw := range seedWords {
-				if len(sw) > 3 && strings.Contains(strings.ToLower(k), strings.ToLower(sw)) {
-					candidates = append(candidates, k)
-				}
-			}
-		}
-
-		if len(candidates) > 0 {
-			currentKey = candidates[rand.Intn(len(candidates))]
-		}
-	}
-
-	if currentKey == "" {
-		currentKey = data.Starts[rand.Intn(len(data.Starts))]
-	}
-
-	parts := strings.Split(currentKey, " ")
-	w1, w2 = parts[0], parts[1]
-
-	output := []string{w1, w2}
-
-	for i := 0; i < 40; i++ {
-		nextOptions, exists := data.Chain[currentKey]
-		if !exists || len(nextOptions) == 0 {
-			break
-		}
-
-		nextWord := nextOptions[rand.Intn(len(nextOptions))]
-		output = append(output, nextWord)
-
-		w1 = w2
-		w2 = nextWord
-		currentKey = key(w1, w2)
-
-		// Soft stop on punctuation
-		if i > 5 && strings.ContainsAny(nextWord, ".!?") {
-			if rand.Float32() > 0.3 {
-				break
-			}
-		}
-	}
-
-	return strings.Join(output, " ")
-}
-
-func key(w1, w2 string) string {
-	return w1 + " " + w2
-}
-
-func init() {
-	rand.Seed(time.Now().UnixNano())
-}
@@ -0,0 +1,102 @@
+The Phoenix and Turtle
+by William Shakespeare
+Edited by Barbara A. Mowat and Paul Werstine
+  with Michael Poston and Rebecca Niles
+Folger Shakespeare Library
+https://shakespeare.folger.edu/shakespeares-works/the-phoenix-and-turtle/
+Created on Jul 31, 2015, from FDT version 0.9.0.1
+
+
+"The Phoenix and Turtle"
+
+
+Let the bird of loudest lay
+On the sole Arabian tree
+Herald sad and trumpet be,
+To whose sound chaste wings obey.
+
+But thou shrieking harbinger,
+Foul precurrer of the fiend,
+Augur of the fever's end,
+To this troop come thou not near.
+
+From this session interdict
+Every fowl of tyrant wing,
+Save the eagle, feathered king;
+Keep the obsequy so strict.
+
+Let the priest in surplice white,
+That defunctive music can,
+Be the death-divining swan,
+Lest the requiem lack his right.
+
+And thou treble-dated crow,
+That thy sable gender mak'st
+With the breath thou giv'st and tak'st,
+'Mongst our mourners shalt thou go.
+
+Here the anthem doth commence:
+Love and constancy is dead,
+Phoenix and the turtle fled
+In a mutual flame from hence.
+
+So they loved, as love in twain
+Had the essence but in one,
+Two distincts, division none;
+Number there in love was slain.
+
+Hearts remote yet not asunder,
+Distance and no space was seen
+'Twixt this turtle and his queen;
+But in them it were a wonder.
+
+So between them love did shine
+That the turtle saw his right
+Flaming in the phoenix' sight;
+Either was the other's mine.
+
+Property was thus appalled
+That the self was not the same;
+Single nature's double name
+Neither two nor one was called.
+
+Reason, in itself confounded,
+Saw division grow together,
+To themselves yet either neither,
+Simple were so well compounded
+
+That it cried, "How true a twain
+Seemeth this concordant one!
+Love hath reason, Reason none,
+If what parts can so remain,"
+
+Whereupon it made this threne
+To the phoenix and the dove,
+Co-supremes and stars of love,
+As chorus to their tragic scene.
+
+
+Threnos
+
+
+Beauty, truth, and rarity,
+Grace in all simplicity,
+Here enclosed, in cinders lie.
+
+Death is now the phoenix' nest,
+And the turtle's loyal breast
+To eternity doth rest,
+
+Leaving no posterity;
+'Twas not their infirmity,
+It was married chastity.
+
+Truth may seem, but cannot be;
+Beauty brag, but 'tis not she;
+Truth and beauty buried be.
+
+To this urn let those repair
+That are either true or fair;
+For these dead birds sigh a prayer.
+
+William Shakespeare
@@ -0,0 +1,262 @@
+package lib
+
+import (
+	"math/rand"
+	"regexp"
+	"strings"
+	"time"
+)
+
+// MarkovData is a word-level Markov chain.
+//
+// Order is the number of words in each key. Chain maps a key (Order words
+// joined by single spaces, see Key) to every word observed directly after
+// that sequence; repeated observations are kept as repeated entries. Starts
+// lists the keys a generated message may begin with.
+//
+// Values of this type are written and read with encoding/gob elsewhere in
+// this change, which is why all fields are exported.
+type MarkovData struct {
+	Order  int
+	Chain  map[string][]string // "word1 ... wordN" -> ["nextWord", ...]
+	Starts []string
+}
+
+// Patterns stripped by CleanText, and the stop-word list used to rank seed
+// words in GenerateMessage:
+//   - urlRegex matches an http:// or https:// URL up to the next whitespace.
+//   - mentionRegex matches "<", one of @ # & !, then everything up to the
+//     next ">" (looks like Discord mention markup; confirm against callers).
+//   - bracketRegex matches the shortest "[...]" span; "." does not cross a
+//     newline, so a span must open and close on the same line.
+//   - speakerRegex matches a leading run of words made only of two or more
+//     A-Z letters, each followed by whitespace.
+//   - stopWords is a set of lower-case common words; seed words found here
+//     are tried after all other seed words.
+var (
+	urlRegex     = regexp.MustCompile(`https?://[^\s]+`)
+	mentionRegex = regexp.MustCompile(`<[@#&!][^>]+>`)
+	bracketRegex = regexp.MustCompile(`\[.*?\]`)
+	speakerRegex = regexp.MustCompile(`^(?:[A-Z]{2,}\s+)+`)
+	stopWords    = map[string]bool{
+		"the": true, "and": true, "a": true, "to": true, "of": true,
+		"in": true, "is": true, "that": true, "it": true, "for": true,
+		"as": true, "with": true, "on": true, "at": true, "by": true,
+		"this": true, "from": true, "but": true, "or": true, "an": true,
+		"be": true, "are": true, "was": true, "were": true, "so": true,
+		"if": true, "out": true, "up": true, "about": true, "into": true,
+		"over": true, "after": true, "beneath": true, "under": true,
+		"above": true, "me": true, "my": true, "mine": true, "you": true,
+		"your": true, "yours": true, "he": true, "him": true, "his": true,
+		"she": true, "her": true, "hers": true, "they": true, "them": true,
+		"their": true, "theirs": true, "we": true, "us": true, "our": true,
+		"ours": true, "who": true, "whom": true, "whose": true, "what": true,
+		"which": true, "when": true, "where": true, "why": true, "how": true,
+		"give": true, "write": true, "tell": true, "say": true, "speak": true,
+		"make": true, "do": true, "does": true, "did": true, "done": true,
+	}
+)
+
+// init seeds the global math/rand source with the current time so that
+// generated messages differ between runs.
+//
+// NOTE(review): rand.Seed is deprecated as of Go 1.20, where the global
+// source is seeded randomly by default. This call can probably be dropped
+// once the module targets Go 1.20 or later; confirm the go.mod version.
+func init() {
+	rand.Seed(time.Now().UnixNano())
+}
+
+// CleanText normalises a piece of text before it is fed to the chain.
+//
+// It deletes everything matched by urlRegex, mentionRegex and bracketRegex,
+// trims the result, removes a leading run of all-caps words (speakerRegex),
+// and finally collapses every run of whitespace into a single space. The
+// result is empty when nothing but whitespace survives.
+func CleanText(text string) string {
+	for _, strip := range []*regexp.Regexp{urlRegex, mentionRegex, bracketRegex} {
+		text = strip.ReplaceAllString(text, "")
+	}
+
+	// speakerRegex is anchored at the start of the string, so surrounding
+	// whitespace has to be removed before it is applied.
+	body := speakerRegex.ReplaceAllString(strings.TrimSpace(text), "")
+	return strings.Join(strings.Fields(body), " ")
+}
+
+// BuildMarkovChain builds a chain whose keys are order consecutive words.
+//
+// A line is skipped as a likely header when, after trimming, it contains at
+// least one cased letter and no lower-case letters. Every other line goes
+// through CleanText, and the surviving words of all lines are concatenated
+// into a single stream, so keys may span line boundaries.
+//
+// The first key of the stream is recorded as a start, as is every key that
+// directly follows a word containing '.', '!' or '?'.
+//
+// The result is never nil. It has no keys and no starts when order is below
+// 1 or when fewer than order+1 words remain after cleaning.
+func BuildMarkovChain(lines []string, order int) *MarkovData {
+	data := &MarkovData{
+		Order:  order,
+		Chain:  make(map[string][]string),
+		Starts: make([]string, 0),
+	}
+
+	// A key needs at least one word. Without this guard a negative order
+	// panics in the slice expressions below and a zero order produces
+	// empty-string keys that cannot be walked.
+	if order < 1 {
+		return data
+	}
+
+	var allWords []string
+	for _, line := range lines {
+		// Skip likely headers/metadata (all caps lines).
+		trimmed := strings.TrimSpace(line)
+		if trimmed != "" && strings.ToUpper(trimmed) == trimmed && strings.ToLower(trimmed) != trimmed {
+			continue
+		}
+
+		// CleanText returns "" for lines with nothing left; Fields of ""
+		// is empty, so such lines contribute no words.
+		allWords = append(allWords, strings.Fields(CleanText(line))...)
+	}
+
+	if len(allWords) < order+1 {
+		return data
+	}
+
+	// First key is always a start.
+	data.Starts = append(data.Starts, Key(allWords[:order]...))
+
+	for i := 0; i < len(allWords)-order; i++ {
+		k := Key(allWords[i : i+order]...)
+		data.Chain[k] = append(data.Chain[k], allWords[i+order])
+
+		// If the word shifting out of the window ends a sentence, the next
+		// window is a start. The loop bound guarantees i+1+order <= len,
+		// so the slice below is always in range.
+		if strings.ContainsAny(allWords[i], ".!?") {
+			data.Starts = append(data.Starts, Key(allWords[i+1:i+1+order]...))
+		}
+	}
+
+	return data
+}
+
+// GenerateMessage produces a pseudo-random message by walking the chain.
+//
+// data is the chain to walk; a nil chain or one without start keys yields "".
+// seed is optional free text such as a question. When it is non-empty the
+// walk tries to begin at a key related to one of its words (see
+// seedCandidates); if nothing matches, or seed is empty, a random entry of
+// data.Starts is used.
+//
+// The result is the starting key followed by up to 40 generated words, joined
+// with single spaces. Generation ends early when the current key has no
+// continuation and, from the seventh generated word on, with probability of
+// roughly 0.7 each time a word containing '.', '!' or '?' is emitted.
+func GenerateMessage(data *MarkovData, seed string) string {
+	const (
+		maxGenerated  = 40  // upper bound on words appended after the start key
+		softStopAfter = 5   // loop index after which punctuation may end the walk
+		keepGoing     = 0.3 // a draw above this ends the walk at punctuation
+	)
+
+	if data == nil || len(data.Starts) == 0 {
+		return ""
+	}
+
+	var currentKey string
+	if seed != "" {
+		if candidates := seedCandidates(data, seed); len(candidates) > 0 {
+			currentKey = candidates[rand.Intn(len(candidates))]
+		}
+	}
+	if currentKey == "" {
+		currentKey = data.Starts[rand.Intn(len(data.Starts))]
+	}
+
+	// window holds the words of the current key. output gets its own backing
+	// array so that sliding the window never overwrites emitted words.
+	window := strings.Fields(currentKey)
+	output := make([]string, len(window), len(window)+maxGenerated)
+	copy(output, window)
+
+	for i := 0; i < maxGenerated; i++ {
+		nextOptions := data.Chain[currentKey]
+		if len(nextOptions) == 0 {
+			break
+		}
+
+		nextWord := nextOptions[rand.Intn(len(nextOptions))]
+		output = append(output, nextWord)
+
+		// A key without words cannot be shifted; stop after this word.
+		if len(window) == 0 {
+			break
+		}
+		window = append(window[1:], nextWord)
+		currentKey = Key(window...)
+
+		// Soft stop on punctuation.
+		if i > softStopAfter && strings.ContainsAny(nextWord, ".!?") && rand.Float32() > keepGoing {
+			break
+		}
+	}
+
+	return strings.Join(output, " ")
+}
+
+// seedCandidates returns the keys a seeded walk may start from, or nil when
+// no usable word of seed relates to the chain. Seed words are tried in the
+// order produced by rankSeedWords, and the first word that matches anything
+// decides the result. Three searches run in turn, each only if the previous
+// one found nothing:
+//  1. start keys whose first word is the seed word,
+//  2. start keys containing the seed word after the first position,
+//  3. any chain key whose first word is the seed word.
+func seedCandidates(data *MarkovData, seed string) []string {
+	words := rankSeedWords(strings.Fields(CleanText(seed)))
+
+	for _, w := range words {
+		if found := matchingStarts(data, w, true); len(found) > 0 {
+			return found
+		}
+	}
+
+	for _, w := range words {
+		if found := matchingStarts(data, w, false); len(found) > 0 {
+			return found
+		}
+	}
+
+	for _, w := range words {
+		var found []string
+		for k := range data.Chain {
+			if keyHasWord(k, w, data.Order, true) {
+				found = append(found, k)
+			}
+		}
+		if len(found) > 0 {
+			return found
+		}
+	}
+
+	return nil
+}
+
+// rankSeedWords reorders words in place so that words absent from stopWords
+// come first and, within each group, longer words precede shorter ones. It
+// returns the words longer than two bytes, lower-cased, in that order.
+func rankSeedWords(words []string) []string {
+	// Plain exchange sort: a seed holds only a handful of words.
+	for i := 0; i < len(words); i++ {
+		for j := i + 1; j < len(words); j++ {
+			stopI := stopWords[strings.ToLower(words[i])]
+			stopJ := stopWords[strings.ToLower(words[j])]
+
+			var swap bool
+			if stopI != stopJ {
+				// Exactly one is a stop word; it belongs behind the other.
+				swap = stopI
+			} else {
+				swap = len(words[i]) < len(words[j])
+			}
+			if swap {
+				words[i], words[j] = words[j], words[i]
+			}
+		}
+	}
+
+	ranked := make([]string, 0, len(words))
+	for _, w := range words {
+		if len(w) > 2 {
+			ranked = append(ranked, strings.ToLower(w))
+		}
+	}
+	return ranked
+}
+
+// matchingStarts returns every entry of data.Starts accepted by keyHasWord
+// for word. Duplicates are kept, so frequent starts stay more likely.
+func matchingStarts(data *MarkovData, word string, leading bool) []string {
+	var found []string
+	for _, k := range data.Starts {
+		if keyHasWord(k, word, data.Order, leading) {
+			found = append(found, k)
+		}
+	}
+	return found
+}
+
+// keyHasWord reports whether the lower-cased key contains word, which must
+// already be lower-case. With leading set only the first word of the key is
+// compared; otherwise only the words after the first are. Keys with no words
+// or with fewer than order words never match.
+func keyHasWord(key, word string, order int, leading bool) bool {
+	parts := strings.Fields(strings.ToLower(key))
+	if len(parts) == 0 || len(parts) < order {
+		return false
+	}
+	if leading {
+		return parts[0] == word
+	}
+	for _, p := range parts[1:] {
+		if p == word {
+			return true
+		}
+	}
+	return false
+}
+
+// Key builds the chain key for a sequence of words by joining them with
+// single spaces. Called with no words it returns the empty string.
+func Key(words ...string) string {
+	return strings.Join(words, " ")
+}
@@ -29,6 +29,10 @@ func main() {
 	initCommands(config)
 	initCommandHandlers(config)

+	if err := command.InitBard("datasets/bard.gob"); err != nil {
+		log.Printf("Failed to load Bard dataset: %v", err)
+	}
+
 	err := lib.InitDB()
 	if err != nil {
 		log.Fatalf("Failed to initialize database: %v", err)
@@ -191,13 +195,13 @@ func initCommands(config *lib.Config) {
 			},
 		},
 		{
-			Name:        "gen",
-			Description: "Generate a random message using markov chains based on channel history",
+			Name:        "bard",
+			Description: "Ask the bard a question",
 			Options: []*discordgo.ApplicationCommandOption{
 				{
-					Type:        discordgo.ApplicationCommandOptionInteger,
-					Name:        "messages",
-					Description: fmt.Sprintf("Number of messages to use (default: %d, max: %d)", config.MarkovDefaultMessages, config.MarkovMaxMessages),
+					Type:        discordgo.ApplicationCommandOptionString,
+					Name:        "question",
+					Description: "The question you want to ask",
 					Required:    false,
 				},
 			},
@@ -255,7 +259,7 @@ func initCommandHandlers(config *lib.Config) {
 	commandHandlers = map[string]func(s *discordgo.Session, i *discordgo.InteractionCreate){
 		"ping":     lib.HandleCommand("ping", time.Duration(config.PingCooldown)*time.Second, command.PingCommand),
 		"hs":       lib.HandleCommand("hs", time.Duration(config.HsCooldown)*time.Second, command.HsCommand),
-		"gen":      lib.HandleCommand("gen", time.Duration(config.MarkovCooldown)*time.Second, command.MarkovCommand),
+		"bard":     lib.HandleCommand("bard", time.Duration(config.MarkovCooldown)*time.Second, command.BardCommand),
 		"ask":      lib.HandleCommand("ask", time.Duration(config.MarkovAskCooldown)*time.Second, command.MarkovQuestionCommand),
 		"himbucks": lib.HandleCommand("himbucks", time.Duration(config.HimbucksCooldown)*time.Second, command.BalanceGetCommand),
 		"himboard": lib.HandleCommand("himboard", time.Duration(config.HimboardCooldown)*time.Second, command.LeaderboardCommand),