All checks were successful
Docker Deploy / build-and-push (push) Successful in 3m23s
263 lines
6.5 KiB
Go
263 lines
6.5 KiB
Go
package lib
|
|
|
|
import (
|
|
"math/rand"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// MarkovData is a word-level Markov model: the key width, the transition
// table, and the keys a generated message may begin with.
type MarkovData struct {
	// Order is the number of words joined together to form one key.
	Order int
	// Chain maps a space-joined key ("word1 ... wordN") to every word that
	// was recorded directly after that word sequence.
	Chain map[string][]string
	// Starts holds the keys that are eligible to open a generated message.
	Starts []string
}
|
|
|
|
// Patterns applied by CleanText to strip noise from raw message text.
var (
	// urlRegex matches an http:// or https:// URL up to the next whitespace.
	urlRegex = regexp.MustCompile(`https?://[^\s]+`)

	// mentionRegex matches an angle-bracket token whose first character
	// after '<' is one of @, #, & or !.
	mentionRegex = regexp.MustCompile(`<[@#&!][^>]+>`)

	// bracketRegex matches the shortest "[...]" span.
	bracketRegex = regexp.MustCompile(`\[.*?\]`)

	// speakerRegex matches one or more leading words made of two or more
	// ASCII capitals, each followed by whitespace.
	speakerRegex = regexp.MustCompile(`^(?:[A-Z]{2,}\s+)+`)
)

// stopWords is the set of lower-case words that GenerateMessage ranks below
// every other seed word when choosing where to start a message.
var stopWords = func() map[string]bool {
	words := []string{
		"the", "and", "a", "to", "of",
		"in", "is", "that", "it", "for",
		"as", "with", "on", "at", "by",
		"this", "from", "but", "or", "an",
		"be", "are", "was", "were", "so",
		"if", "out", "up", "about", "into",
		"over", "after", "beneath", "under",
		"above", "me", "my", "mine", "you",
		"your", "yours", "he", "him", "his",
		"she", "her", "hers", "they", "them",
		"their", "theirs", "we", "us", "our",
		"ours", "who", "whom", "whose", "what",
		"which", "when", "where", "why", "how",
		"give", "write", "tell", "say", "speak",
		"make", "do", "does", "did", "done",
	}

	set := make(map[string]bool, len(words))
	for _, w := range words {
		set[w] = true
	}
	return set
}()
|
|
|
|
// init seeds the package-level math/rand generator from the current wall
// clock time in nanoseconds.
//
// NOTE(review): rand.Seed is deprecated as of Go 1.20, where the global
// generator is seeded randomly by default. Confirm the module's Go version
// before dropping this call (and the "time" import that only it uses here).
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
}
|
|
|
|
func CleanText(text string) string {
|
|
text = urlRegex.ReplaceAllString(text, "")
|
|
text = mentionRegex.ReplaceAllString(text, "")
|
|
text = bracketRegex.ReplaceAllString(text, "")
|
|
text = strings.TrimSpace(text)
|
|
text = speakerRegex.ReplaceAllString(text, "")
|
|
return strings.Join(strings.Fields(text), " ")
|
|
}
|
|
|
|
func BuildMarkovChain(lines []string, order int) *MarkovData {
|
|
data := &MarkovData{
|
|
Order: order,
|
|
Chain: make(map[string][]string),
|
|
Starts: make([]string, 0),
|
|
}
|
|
|
|
var allWords []string
|
|
|
|
for _, line := range lines {
|
|
// Skip likely headers/metadata (all caps lines)
|
|
trimmed := strings.TrimSpace(line)
|
|
if trimmed != "" && strings.ToUpper(trimmed) == trimmed && strings.ToLower(trimmed) != trimmed {
|
|
continue
|
|
}
|
|
|
|
cleaned := CleanText(line)
|
|
if cleaned == "" {
|
|
continue
|
|
}
|
|
|
|
allWords = append(allWords, strings.Fields(cleaned)...)
|
|
}
|
|
|
|
if len(allWords) < order+1 {
|
|
return data
|
|
}
|
|
|
|
// First key is always a start
|
|
data.Starts = append(data.Starts, Key(allWords[:order]...))
|
|
|
|
for i := 0; i < len(allWords)-order; i++ {
|
|
keyWords := allWords[i : i+order]
|
|
nextWord := allWords[i+order]
|
|
|
|
k := Key(keyWords...)
|
|
data.Chain[k] = append(data.Chain[k], nextWord)
|
|
|
|
// If the word shifting out ends a sentence, the next sequence is a start
|
|
if strings.ContainsAny(allWords[i], ".!?") {
|
|
if i+1+order <= len(allWords) {
|
|
data.Starts = append(data.Starts, Key(allWords[i+1:i+1+order]...))
|
|
}
|
|
}
|
|
}
|
|
|
|
return data
|
|
}
|
|
|
|
func GenerateMessage(data *MarkovData, seed string) string {
|
|
if len(data.Starts) == 0 {
|
|
return ""
|
|
}
|
|
|
|
var currentKey string
|
|
|
|
// Try to seed based on input question
|
|
if seed != "" {
|
|
seedWords := strings.Fields(CleanText(seed))
|
|
|
|
// Sort seed words: significant words first, then by length
|
|
for i := 0; i < len(seedWords); i++ {
|
|
for j := i + 1; j < len(seedWords); j++ {
|
|
sw1 := strings.ToLower(seedWords[i])
|
|
sw2 := strings.ToLower(seedWords[j])
|
|
isStop1 := stopWords[sw1]
|
|
isStop2 := stopWords[sw2]
|
|
|
|
// If one is a stop word and the other isn't, prioritize the non-stop word
|
|
if isStop1 && !isStop2 {
|
|
seedWords[i], seedWords[j] = seedWords[j], seedWords[i]
|
|
} else if !isStop1 && isStop2 {
|
|
continue
|
|
} else {
|
|
// Otherwise sort by length
|
|
if len(seedWords[i]) < len(seedWords[j]) {
|
|
seedWords[i], seedWords[j] = seedWords[j], seedWords[i]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
var candidates []string
|
|
|
|
// 1. Try to find a sentence starter
|
|
// We iterate seed words first to prioritize matches for longer words
|
|
for _, sw := range seedWords {
|
|
if len(sw) <= 2 {
|
|
continue
|
|
}
|
|
swLower := strings.ToLower(sw)
|
|
var primaryMatches []string // starts with word
|
|
|
|
for _, startKey := range data.Starts {
|
|
parts := strings.Fields(strings.ToLower(startKey))
|
|
if len(parts) < data.Order {
|
|
continue
|
|
}
|
|
if parts[0] == swLower {
|
|
primaryMatches = append(primaryMatches, startKey)
|
|
}
|
|
}
|
|
|
|
// If we found sentence starters beginning with this word, use them exclusively
|
|
if len(primaryMatches) > 0 {
|
|
candidates = primaryMatches
|
|
break
|
|
}
|
|
}
|
|
|
|
// 2. If no perfect starts, try any start containing the word
|
|
if len(candidates) == 0 {
|
|
for _, sw := range seedWords {
|
|
if len(sw) <= 2 {
|
|
continue
|
|
}
|
|
swLower := strings.ToLower(sw)
|
|
|
|
for _, startKey := range data.Starts {
|
|
parts := strings.Fields(strings.ToLower(startKey))
|
|
if len(parts) < data.Order {
|
|
continue
|
|
}
|
|
// Check remaining words in key
|
|
found := false
|
|
for i := 1; i < len(parts); i++ {
|
|
if parts[i] == swLower {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if found {
|
|
candidates = append(candidates, startKey)
|
|
}
|
|
}
|
|
if len(candidates) > 0 {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// 3. If no starts, try to find any connection in the chain
|
|
if len(candidates) == 0 {
|
|
for _, sw := range seedWords {
|
|
if len(sw) <= 2 {
|
|
continue
|
|
}
|
|
swLower := strings.ToLower(sw)
|
|
var matches []string
|
|
|
|
for k := range data.Chain {
|
|
parts := strings.Fields(strings.ToLower(k))
|
|
if len(parts) < data.Order {
|
|
continue
|
|
}
|
|
if parts[0] == swLower {
|
|
matches = append(matches, k)
|
|
}
|
|
}
|
|
|
|
if len(matches) > 0 {
|
|
candidates = matches
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(candidates) > 0 {
|
|
currentKey = candidates[rand.Intn(len(candidates))]
|
|
}
|
|
}
|
|
|
|
if currentKey == "" {
|
|
currentKey = data.Starts[rand.Intn(len(data.Starts))]
|
|
}
|
|
|
|
output := strings.Fields(currentKey)
|
|
|
|
for i := 0; i < 40; i++ {
|
|
nextOptions, exists := data.Chain[currentKey]
|
|
if !exists || len(nextOptions) == 0 {
|
|
break
|
|
}
|
|
|
|
nextWord := nextOptions[rand.Intn(len(nextOptions))]
|
|
output = append(output, nextWord)
|
|
|
|
// Shift the key window
|
|
currentWords := strings.Fields(currentKey)
|
|
if len(currentWords) >= 1 {
|
|
newKeyWords := append(currentWords[1:], nextWord)
|
|
currentKey = Key(newKeyWords...)
|
|
} else {
|
|
break
|
|
}
|
|
|
|
// Soft stop on punctuation
|
|
if i > 5 && strings.ContainsAny(nextWord, ".!?") {
|
|
if rand.Float32() > 0.3 {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
return strings.Join(output, " ")
|
|
}
|
|
|
|
// Key builds the lookup key for a sequence of words by joining them with a
// single space. With no words it returns the empty string.
func Key(words ...string) string {
	const separator = " "
	return strings.Join(words, separator)
}
|