package lib

import (
	"math/rand"
	"regexp"
	"sort"
	"strings"
	"time"
	"unicode"
)

// MarkovData is a word-level Markov model built by BuildMarkovChain and
// consumed by GenerateMessage.
type MarkovData struct {
	// Order is the number of consecutive words that form one chain key.
	Order int

	// Chain maps a key of Order space-joined words to every word that was
	// observed directly after that sequence, e.g. with Order 2:
	// "word1 word2" -> ["word3", ...]. Repeated followers appear repeatedly,
	// which is what weights the random choice during generation.
	Chain map[string][]string

	// Starts lists keys a generated message may begin with: the first key of
	// the corpus plus every key that follows a sentence-ending word.
	Starts []string
}

const (
	// maxGeneratedWords caps how many words are appended after the start key.
	maxGeneratedWords = 40

	// softStopAfter is the loop index after which sentence-ending punctuation
	// may terminate the message.
	softStopAfter = 5

	// softStopContinueChance is the probability of continuing past a
	// sentence-ending word once softStopAfter has been passed.
	softStopContinueChance = 0.3

	// minSeedWordLen is the shortest (normalized) seed word used for matching.
	minSeedWordLen = 3

	// sentenceEnders are the characters that terminate a sentence.
	sentenceEnders = ".!?"
)

var (
	// urlRegex matches http(s) URLs up to the next whitespace.
	urlRegex = regexp.MustCompile(`https?://[^\s]+`)

	// mentionRegex matches angle-bracket tokens starting with @, #, & or !.
	mentionRegex = regexp.MustCompile(`<[@#&!][^>]+>`)

	// bracketRegex matches the shortest [ ... ] span.
	bracketRegex = regexp.MustCompile(`\[.*?\]`)

	// speakerRegex matches one or more leading all-caps words (two letters or
	// more), each followed by whitespace.
	speakerRegex = regexp.MustCompile(`^(?:[A-Z]{2,}\s+)+`)

	// stopWords are low-information words that are tried last when picking a
	// seed word for GenerateMessage.
	stopWords = map[string]bool{
		"the": true, "and": true, "a": true, "to": true, "of": true, "in": true, "is": true, "that": true, "it": true,
		"for": true, "as": true, "with": true, "on": true, "at": true, "by": true, "this": true, "from": true,
		"but": true, "or": true, "an": true, "be": true, "are": true, "was": true, "were": true, "so": true, "if": true,
		"out": true, "up": true, "about": true, "into": true, "over": true, "after": true, "beneath": true, "under": true,
		"above": true, "me": true, "my": true, "mine": true, "you": true, "your": true, "yours": true, "he": true,
		"him": true, "his": true, "she": true, "her": true, "hers": true, "they": true, "them": true, "their": true,
		"theirs": true, "we": true, "us": true, "our": true, "ours": true, "who": true, "whom": true, "whose": true,
		"what": true, "which": true, "when": true, "where": true, "why": true, "how": true,
		"give": true, "write": true, "tell": true, "say": true, "speak": true, "make": true, "do": true, "does": true,
		"did": true, "done": true,
	}
)

// init seeds the global math/rand source from the wall clock.
//
// NOTE(review): rand.Seed is deprecated as of Go 1.20, where the global
// source is seeded automatically; it is kept so that older toolchains still
// produce varied output.
func init() {
	rand.Seed(time.Now().UnixNano())
}

// CleanText strips URLs, angle-bracket mentions, [bracketed] spans and a
// leading run of all-caps words from text, then collapses all whitespace runs
// into single spaces.
func CleanText(text string) string {
	text = urlRegex.ReplaceAllString(text, "")
	text = mentionRegex.ReplaceAllString(text, "")
	text = bracketRegex.ReplaceAllString(text, "")
	// Trim first so that the ^-anchored speaker pattern sees the first word.
	text = strings.TrimSpace(text)
	text = speakerRegex.ReplaceAllString(text, "")
	return strings.Join(strings.Fields(text), " ")
}

// isAllCaps reports whether s is non-empty, contains at least one cased
// letter, and has no lower-case letters.
func isAllCaps(s string) bool {
	return s != "" && strings.ToUpper(s) == s && strings.ToLower(s) != s
}

// endsSentence reports whether word finishes with '.', '!' or '?', ignoring
// any trailing punctuation or symbols such as closing quotes, brackets or
// markup. Punctuation in the middle of a word ("3.5", "example.com") does
// not count.
func endsSentence(word string) bool {
	core := strings.TrimRightFunc(word, func(r rune) bool {
		return !strings.ContainsRune(sentenceEnders, r) && (unicode.IsPunct(r) || unicode.IsSymbol(r))
	})
	// sentenceEnders is pure ASCII, so inspecting the final byte is safe even
	// when core ends in a multi-byte rune.
	return core != "" && strings.IndexByte(sentenceEnders, core[len(core)-1]) >= 0
}

// BuildMarkovChain builds a Markov model of the given order from lines.
//
// Lines written entirely in upper case are skipped as likely headers or
// metadata; every other line is passed through CleanText and its words are
// appended to one continuous word stream, so chain links may cross line
// boundaries. The returned model is empty (but never nil) when order is
// negative or when fewer than order+1 words remain.
func BuildMarkovChain(lines []string, order int) *MarkovData {
	data := &MarkovData{
		Order:  order,
		Chain:  make(map[string][]string),
		Starts: make([]string, 0),
	}
	// A negative order would otherwise panic when slicing the word stream.
	if order < 0 {
		return data
	}

	var allWords []string
	for _, line := range lines {
		if isAllCaps(strings.TrimSpace(line)) {
			continue
		}
		allWords = append(allWords, strings.Fields(CleanText(line))...)
	}
	if len(allWords) < order+1 {
		return data
	}

	// The very first key is always a valid start.
	data.Starts = append(data.Starts, Key(allWords[:order]...))

	for i := 0; i+order < len(allWords); i++ {
		k := Key(allWords[i : i+order]...)
		data.Chain[k] = append(data.Chain[k], allWords[i+order])

		// When the word sliding out of the window ends a sentence, the next
		// window begins a new one. The loop condition guarantees that
		// i+1+order <= len(allWords), so the slice is always in range.
		if endsSentence(allWords[i]) {
			data.Starts = append(data.Starts, Key(allWords[i+1:i+1+order]...))
		}
	}
	return data
}

// GenerateMessage produces a random message from data.
//
// When seed is non-empty its words are used to choose where the message
// begins (see seededKey); otherwise, or when nothing matches, a random entry
// of data.Starts is used. Up to maxGeneratedWords words are then appended by
// walking the chain, and once more than softStopAfter iterations have run a
// sentence-ending word usually terminates the message. It returns "" when
// data is nil or has no start keys.
func GenerateMessage(data *MarkovData, seed string) string {
	if data == nil || len(data.Starts) == 0 {
		return ""
	}

	currentKey := seededKey(data, seed)
	if currentKey == "" {
		currentKey = data.Starts[rand.Intn(len(data.Starts))]
	}

	output := strings.Fields(currentKey)
	for i := 0; i < maxGeneratedWords; i++ {
		nextOptions := data.Chain[currentKey]
		if len(nextOptions) == 0 {
			break
		}
		nextWord := nextOptions[rand.Intn(len(nextOptions))]
		output = append(output, nextWord)

		// Slide the key window: drop its first word, append the new one.
		window := strings.Fields(currentKey)
		if len(window) == 0 {
			break
		}
		currentKey = Key(append(window[1:], nextWord)...)

		// Soft stop: prefer ending on sentence punctuation, but not always.
		if i > softStopAfter && endsSentence(nextWord) && rand.Float32() > softStopContinueChance {
			break
		}
	}
	return strings.Join(output, " ")
}

// splitKey pairs a chain key with its normalized words.
type splitKey struct {
	key   string
	parts []string
}

// newSplitKey normalizes the words of key. It reports false for keys that are
// empty or hold fewer than order words, which cannot be matched.
func newSplitKey(key string, order int) (splitKey, bool) {
	fields := strings.Fields(key)
	if len(fields) == 0 || len(fields) < order {
		return splitKey{}, false
	}
	for i, f := range fields {
		fields[i] = normalizeWord(f)
	}
	return splitKey{key: key, parts: fields}, true
}

// normalizeWord lower-cases w and strips punctuation from both ends so that
// "Weather?" and "weather" compare equal.
func normalizeWord(w string) string {
	return strings.ToLower(strings.TrimFunc(w, unicode.IsPunct))
}

// rankSeedWords returns the normalized words of seed that are at least
// minSeedWordLen bytes long, ordered with non-stop words first and, within
// each group, longer words first. Ties keep their original order.
func rankSeedWords(seed string) []string {
	var words []string
	for _, raw := range strings.Fields(CleanText(seed)) {
		if w := normalizeWord(raw); len(w) >= minSeedWordLen {
			words = append(words, w)
		}
	}
	sort.SliceStable(words, func(i, j int) bool {
		stopI, stopJ := stopWords[words[i]], stopWords[words[j]]
		if stopI != stopJ {
			return !stopI
		}
		return len(words[i]) > len(words[j])
	})
	return words
}

// leadsWith reports whether the first word of a key equals word.
func leadsWith(parts []string, word string) bool {
	return parts[0] == word
}

// containsAfterFirst reports whether any word of a key other than the first
// equals word.
func containsAfterFirst(parts []string, word string) bool {
	for _, p := range parts[1:] {
		if p == word {
			return true
		}
	}
	return false
}

// firstMatches returns every key matched by the highest-ranked seed word that
// matches at least one key, or nil when no seed word matches.
func firstMatches(seedWords []string, keys []splitKey, match func(parts []string, word string) bool) []string {
	for _, sw := range seedWords {
		var matches []string
		for _, k := range keys {
			if match(k.parts, sw) {
				matches = append(matches, k.key)
			}
		}
		if len(matches) > 0 {
			return matches
		}
	}
	return nil
}

// seededKey picks a key related to seed, or "" when seed yields no usable
// words or nothing matches. Candidates are searched in three passes, each
// trying the seed words in ranked order:
//  1. start keys whose first word is the seed word,
//  2. start keys containing the seed word at a later position,
//  3. any chain key whose first word is the seed word.
//
// One of the candidates found by the first successful pass is chosen at
// random.
func seededKey(data *MarkovData, seed string) string {
	seedWords := rankSeedWords(seed)
	if len(seedWords) == 0 {
		return ""
	}

	// Normalize each start key once rather than once per seed word.
	starts := make([]splitKey, 0, len(data.Starts))
	for _, k := range data.Starts {
		if sk, ok := newSplitKey(k, data.Order); ok {
			starts = append(starts, sk)
		}
	}

	candidates := firstMatches(seedWords, starts, leadsWith)
	if len(candidates) == 0 {
		candidates = firstMatches(seedWords, starts, containsAfterFirst)
	}
	if len(candidates) == 0 {
		chainKeys := make([]splitKey, 0, len(data.Chain))
		for k := range data.Chain {
			if sk, ok := newSplitKey(k, data.Order); ok {
				chainKeys = append(chainKeys, sk)
			}
		}
		candidates = firstMatches(seedWords, chainKeys, leadsWith)
	}
	if len(candidates) == 0 {
		return ""
	}
	return candidates[rand.Intn(len(candidates))]
}

// Key joins words with single spaces to form a chain key.
func Key(words ...string) string {
	return strings.Join(words, " ")
}