Trying some weird training nonsense. Maybe this will be fun.
All checks were successful
Docker Deploy / build-and-push (push) Successful in 3m23s

This commit is contained in:
2026-01-20 14:48:53 -07:00
parent 0081978489
commit 9694a42f3f
49 changed files with 190186 additions and 159 deletions

262
lib/markov.go Normal file
View File

@@ -0,0 +1,262 @@
package lib
import (
"math/rand"
"regexp"
"strings"
"time"
)
// MarkovData is a word-level Markov model: a table from a fixed-size window
// of words to the words observed immediately after that window, plus the
// windows that are considered valid places to begin a generated message.
type MarkovData struct {
	// Order is the number of consecutive words joined together to form each
	// key in Chain and each entry in Starts.
	Order int
	// Chain maps a key (Order words joined by single spaces, e.g. "w1 w2")
	// to every word seen directly after that sequence. Repeated followers
	// are stored repeatedly, so a uniform random pick from the slice is
	// weighted by how often each follower occurred.
	Chain map[string][]string
	// Starts holds keys that may open a generated message. Entries may
	// repeat; a uniform random pick is therefore frequency-weighted.
	Starts []string
}
// Package-level patterns used by CleanText and the stop-word table used by
// GenerateMessage when ranking seed words.
var (
	// urlRegex matches "http://" or "https://" followed by every character
	// up to the next whitespace.
	urlRegex = regexp.MustCompile(`https?://[^\s]+`)
	// mentionRegex matches an angle-bracketed token whose first character is
	// one of @ # & ! (e.g. "<@123>").
	// NOTE(review): this looks like chat-platform mention markup — confirm
	// against the source of the training text.
	mentionRegex = regexp.MustCompile(`<[@#&!][^>]+>`)
	// bracketRegex matches the shortest "[...]" span on a line.
	bracketRegex = regexp.MustCompile(`\[.*?\]`)
	// speakerRegex matches one or more leading words made only of two or
	// more ASCII capitals, each followed by whitespace (e.g. "JOHN SMITH ").
	// NOTE(review): presumably transcript speaker labels, judging by the
	// name — confirm.
	speakerRegex = regexp.MustCompile(`^(?:[A-Z]{2,}\s+)+`)
	// stopWords is the set of lowercase words that GenerateMessage treats as
	// low-signal: a seed word found here is ranked after every seed word
	// that is not.
	stopWords = map[string]bool{
		"the": true, "and": true, "a": true, "to": true, "of": true,
		"in": true, "is": true, "that": true, "it": true, "for": true,
		"as": true, "with": true, "on": true, "at": true, "by": true,
		"this": true, "from": true, "but": true, "or": true, "an": true,
		"be": true, "are": true, "was": true, "were": true, "so": true,
		"if": true, "out": true, "up": true, "about": true, "into": true,
		"over": true, "after": true, "beneath": true, "under": true,
		"above": true, "me": true, "my": true, "mine": true, "you": true,
		"your": true, "yours": true, "he": true, "him": true, "his": true,
		"she": true, "her": true, "hers": true, "they": true, "them": true,
		"their": true, "theirs": true, "we": true, "us": true, "our": true,
		"ours": true, "who": true, "whom": true, "whose": true, "what": true,
		"which": true, "when": true, "where": true, "why": true, "how": true,
		"give": true, "write": true, "tell": true, "say": true, "speak": true,
		"make": true, "do": true, "does": true, "did": true, "done": true,
	}
)
// init seeds the shared math/rand generator from the wall clock so that
// generated messages differ from one process start to the next.
//
// NOTE(review): rand.Seed is deprecated as of Go 1.20, where the global
// generator is seeded randomly on its own. Dropping this call would also
// require removing the file's "time" import, so it is left in place here.
func init() {
	now := time.Now()
	rand.Seed(now.UnixNano())
}
func CleanText(text string) string {
text = urlRegex.ReplaceAllString(text, "")
text = mentionRegex.ReplaceAllString(text, "")
text = bracketRegex.ReplaceAllString(text, "")
text = strings.TrimSpace(text)
text = speakerRegex.ReplaceAllString(text, "")
return strings.Join(strings.Fields(text), " ")
}
// BuildMarkovChain trains a word-level Markov model of the given order from
// lines.
//
// Each line is skipped when it contains letters and all of them are upper
// case (treated as a header), or when CleanText reduces it to nothing. The
// remaining lines are cleaned, split on whitespace and concatenated into one
// word stream, so transitions may span line boundaries.
//
// For every window of order consecutive words the following word is appended
// to Chain under the window's Key. The first window is always recorded in
// Starts; additionally, whenever the first word of a window contains '.',
// '!' or '?', the window that begins right after that word is recorded in
// Starts as well.
//
// The returned value is never nil. It has an empty Chain and empty (non-nil)
// Starts when order is less than 1 or when fewer than order+1 words are
// available. Order always echoes the order argument.
func BuildMarkovChain(lines []string, order int) *MarkovData {
	data := &MarkovData{
		Order:  order,
		Chain:  make(map[string][]string),
		Starts: make([]string, 0),
	}

	// A window needs at least one word. A negative order would panic when
	// slicing below, and a zero order would key every transition on "".
	if order < 1 {
		return data
	}

	var allWords []string
	for _, line := range lines {
		// Skip likely headers/metadata: the line has at least one cased
		// letter (lowercasing changes it) and none of them is lower case
		// (uppercasing leaves it unchanged).
		trimmed := strings.TrimSpace(line)
		if trimmed != "" && strings.ToUpper(trimmed) == trimmed && strings.ToLower(trimmed) != trimmed {
			continue
		}

		cleaned := CleanText(line)
		if cleaned == "" {
			continue
		}
		allWords = append(allWords, strings.Fields(cleaned)...)
	}

	// At least one full window plus one follower is required.
	if len(allWords) <= order {
		return data
	}

	// The very first window is always a valid starting point.
	data.Starts = append(data.Starts, Key(allWords[:order]...))

	for i := 0; i+order < len(allWords); i++ {
		k := Key(allWords[i : i+order]...)
		data.Chain[k] = append(data.Chain[k], allWords[i+order])

		// If the word about to shift out of the window ends a sentence, the
		// next window is a start. The loop condition guarantees
		// i+1+order <= len(allWords), so the slice is always in range.
		if strings.ContainsAny(allWords[i], ".!?") {
			data.Starts = append(data.Starts, Key(allWords[i+1:i+1+order]...))
		}
	}

	return data
}
// markovTrimCutset lists the punctuation stripped from both ends of a word
// before seed words are compared with stop words and with chain keys.
const markovTrimCutset = ".,!?;:\"'()…“”‘’"

// GenerateMessage produces a pseudo-random message from data.
//
// When seed is non-empty, its significant words are used to pick a starting
// key related to the seed (see markovSeedKey); otherwise, or when nothing in
// the model matches, a random entry of data.Starts is used. Generation then
// repeatedly picks a random follower of the current key from data.Chain and
// slides the key window forward by one word. It stops when the current key
// has no followers, after 40 appended words, or — once more than six words
// have been appended — with 70% probability after a word containing '.',
// '!' or '?'.
//
// It returns "" when data is nil or has no Starts.
func GenerateMessage(data *MarkovData, seed string) string {
	if data == nil || len(data.Starts) == 0 {
		return ""
	}

	const (
		maxSteps       = 40  // upper bound on words appended after the starting key
		softStopAfter  = 5   // a soft stop is only considered when the step index exceeds this
		continueChance = 0.3 // rand.Float32 must exceed this for a soft stop to end the message
	)

	currentKey := markovSeedKey(data, seed)
	if currentKey == "" {
		currentKey = data.Starts[rand.Intn(len(data.Starts))]
	}

	output := strings.Fields(currentKey)
	for i := 0; i < maxSteps; i++ {
		nextOptions := data.Chain[currentKey]
		if len(nextOptions) == 0 {
			break
		}
		nextWord := nextOptions[rand.Intn(len(nextOptions))]
		output = append(output, nextWord)

		// Shift the key window: drop its first word, add the new one.
		window := strings.Fields(currentKey)
		if len(window) == 0 {
			break
		}
		currentKey = Key(append(window[1:], nextWord)...)

		// Soft stop on punctuation so messages tend to end on a sentence.
		if i > softStopAfter && strings.ContainsAny(nextWord, ".!?") && rand.Float32() > continueChance {
			break
		}
	}
	return strings.Join(output, " ")
}

// markovSeedKey picks a key related to seed, or returns "" when seed has no
// usable word or the model contains no match. Ranked seed words are tried in
// order against three strategies, each stopping at the first word that
// matches anything:
//  1. start keys whose first word is the seed word;
//  2. start keys containing the seed word after the first position;
//  3. any chain key whose first word is the seed word.
//
// The result is a uniform random pick among the matches of the winning word.
func markovSeedKey(data *MarkovData, seed string) string {
	if seed == "" {
		return ""
	}
	seedWords := markovRankSeedWords(seed)
	if len(seedWords) == 0 {
		return ""
	}

	// Normalize every start key once; both start-based strategies reuse it.
	startWords := make([][]string, len(data.Starts))
	for i, key := range data.Starts {
		startWords[i] = markovKeyWords(key, data.Order)
	}

	candidates := markovMatchStarts(data.Starts, startWords, seedWords, true)
	if len(candidates) == 0 {
		candidates = markovMatchStarts(data.Starts, startWords, seedWords, false)
	}
	if len(candidates) == 0 {
		candidates = markovMatchChain(data, seedWords)
	}
	if len(candidates) == 0 {
		return ""
	}
	return candidates[rand.Intn(len(candidates))]
}

// markovRankSeedWords cleans seed, normalizes each word, orders the words so
// that non-stop words precede stop words and longer words precede shorter
// ones within each group, and drops words of two bytes or fewer.
func markovRankSeedWords(seed string) []string {
	words := strings.Fields(CleanText(seed))
	for i, w := range words {
		// Without this, "dragons?" would neither be recognized as a
		// (non-)stop word nor match a key word "dragons".
		words[i] = markovNormalizeWord(w)
	}

	// Exchange sort: after pass i, words[i] holds the best remaining word.
	// Seeds are short, so the quadratic cost is irrelevant.
	for i := range words {
		for j := i + 1; j < len(words); j++ {
			if markovOutranks(words[j], words[i]) {
				words[i], words[j] = words[j], words[i]
			}
		}
	}

	ranked := words[:0]
	for _, w := range words {
		if len(w) > 2 {
			ranked = append(ranked, w)
		}
	}
	return ranked
}

// markovOutranks reports whether normalized seed word a must be tried before
// b: a non-stop word beats a stop word, otherwise the longer word wins.
func markovOutranks(a, b string) bool {
	aStop, bStop := stopWords[a], stopWords[b]
	if aStop != bStop {
		return bStop
	}
	return len(a) > len(b)
}

// markovNormalizeWord lowercases w and strips surrounding punctuation.
func markovNormalizeWord(w string) string {
	return strings.ToLower(strings.Trim(w, markovTrimCutset))
}

// markovKeyWords splits key into normalized words. It returns nil when the
// key is empty or has fewer than order words, so callers can index element 0
// of any non-empty result safely.
func markovKeyWords(key string, order int) []string {
	parts := strings.Fields(key)
	if len(parts) == 0 || len(parts) < order {
		return nil
	}
	for i, p := range parts {
		parts[i] = markovNormalizeWord(p)
	}
	return parts
}

// markovMatchStarts returns the start keys matching the first seed word that
// matches any. startWords[i] must be the normalized words of starts[i]. With
// leading set only the first word of each key is examined; otherwise only
// the words after the first are.
func markovMatchStarts(starts []string, startWords [][]string, seedWords []string, leading bool) []string {
	for _, sw := range seedWords {
		var matches []string
		for i, parts := range startWords {
			if len(parts) == 0 {
				continue
			}
			scope := parts[1:]
			if leading {
				scope = parts[:1]
			}
			if markovHasWord(scope, sw) {
				matches = append(matches, starts[i])
			}
		}
		if len(matches) > 0 {
			return matches
		}
	}
	return nil
}

// markovMatchChain returns every chain key whose first word equals the first
// seed word that matches any chain key.
func markovMatchChain(data *MarkovData, seedWords []string) []string {
	for _, sw := range seedWords {
		var matches []string
		for key := range data.Chain {
			if parts := markovKeyWords(key, data.Order); len(parts) > 0 && parts[0] == sw {
				matches = append(matches, key)
			}
		}
		if len(matches) > 0 {
			return matches
		}
	}
	return nil
}

// markovHasWord reports whether words contains target.
func markovHasWord(words []string, target string) bool {
	for _, w := range words {
		if w == target {
			return true
		}
	}
	return false
}
// Key builds the lookup key for a sequence of words by joining them with a
// single space. Called with no words it returns the empty string.
func Key(words ...string) string {
	const separator = " "
	joined := strings.Join(words, separator)
	return joined
}