###
# Copyright (c) 2019 oddluck
# All rights reserved.
#
#
###

import supybot.utils as utils
from supybot.commands import *
import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks
import supybot.ircmsgs as ircmsgs
import supybot.log as log

import os
import csv
import time
from datetime import datetime
import pickle
import requests
import random
import re
from nltk.tokenize import sent_tokenize
from ftfy import fix_text

try:
    from supybot.i18n import PluginInternationalization
    _ = PluginInternationalization('RedditBot')
except ImportError:
    # Placeholder that allows running the plugin on a bot
    # without the i18n module.
    _ = lambda x: x


class RedditBot(callbacks.Plugin):
    """Generates chat replies using subreddit comments"""
    threaded = True

    def __init__(self, irc):
        self.__parent = super(RedditBot, self)
        self.__parent.__init__(irc)
        self.stopwords = self.add_extra_words()
        self.model = {}
        self.MATCH_MESSAGE_STRIPNICK = re.compile('^(<[^ ]+> )?(?P<message>.*)$')

    def add_extra_words(self):
        """Adds the title-case and uppercase versions of all words to STOP_WORDS.

        We parse local copies of stop words downloaded from the following repositories:

        https://github.com/stopwords-iso/stopwords-es
        https://github.com/stopwords-iso/stopwords-en
        """
        ES_STOPWORDS_FILE = "{}/stopwords-es.txt".format(os.path.dirname(os.path.abspath(__file__)))
        EN_STOPWORDS_FILE = "{}/stopwords-en.txt".format(os.path.dirname(os.path.abspath(__file__)))

        STOP_WORDS = set()

        with open(ES_STOPWORDS_FILE, "r", encoding="utf-8") as temp_file:
            for word in temp_file.read().splitlines():
                STOP_WORDS.add(word)

        with open(EN_STOPWORDS_FILE, "r", encoding="utf-8") as temp_file:
            for word in temp_file.read().splitlines():
                STOP_WORDS.add(word)

        extra_words = list()

        for word in STOP_WORDS:
            extra_words.append(word.title())
            extra_words.append(word.upper())

        for word in extra_words:
            STOP_WORDS.add(word)

        return STOP_WORDS

    def read_model(self, file_name):
        """Loads the specified pickle file.

        Parameters
        ----------
        file_name : str
            The location of the pickle file.

        Returns
        -------
        dict
            The dictionary inside the pickle.
        """
        with open(file_name, "rb") as model_file:
            return pickle.load(model_file)
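    # For reference, the pickled model is the order-2 Markov dictionary built by
    # create_model() below: each key is a two-word prefix and each value is the
    # list of words observed after that prefix. A tiny, purely hypothetical example:
    #
    #     {"I like": ["pizza", "turtles."],
    #      "like pizza": ["a"],
    #      "pizza a": ["lot."]}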
    def get_prefix(self, model):
        """Get a random prefix that starts in uppercase.

        Parameters
        ----------
        model : dict
            The dictionary containing all the pairs and their possible outcomes.

        Returns
        -------
        str
            The randomly selected prefix.
        """
        model_keys = list(model.keys())

        # We give it a maximum of 10,000 tries.
        for _ in range(10000):
            random_prefix = random.choice(model_keys)

            if random_prefix[0].isupper():
                ends_with_punctuation = False
                stripped_suffix = random_prefix.strip()

                for char in [".", "?", "!"]:
                    if stripped_suffix[-1] == char:
                        ends_with_punctuation = True
                        break

                if not ends_with_punctuation:
                    break

        return random_prefix

    def get_prefix_with_context(self, model, context):
        """Get a random prefix that matches the given context.

        Parameters
        ----------
        model : dict
            The dictionary containing all the pairs and their possible outcomes.

        context : str
            A sentence which will be separated into keywords.

        Returns
        -------
        str
            The randomly selected context-aware prefix.
        """
        # Some light cleanup.
        context = context.replace("?", "").replace("!", "").replace(".", "")
        context_keywords = context.split()

        # We remove stop words from the context.
        # We use reversed() to remove items from the list without affecting the sequence.
        for word in reversed(context_keywords):
            if len(word) <= 3 or word in self.stopwords:
                context_keywords.remove(word)

        # If our context has no keywords left we return a random prefix.
        if len(context_keywords) == 0:
            return self.get_prefix(model)

        # We sample one prefix for each available keyword and return only one of them.
        model_keys = list(model.keys())
        random.shuffle(model_keys)

        sampled_prefixes = list()

        for word in context_keywords:
            for prefix in model_keys:
                if word in prefix or word.lower() in prefix or word.title() in prefix:
                    sampled_prefixes.append(prefix)
                    break

        # If we don't get any samples we fall back to the random prefix method.
        if len(sampled_prefixes) == 0:
            return self.get_prefix(model)
        else:
            return random.choice(sampled_prefixes)

    def generate_comment(self, model, number_of_sentences, initial_prefix, order):
        """Generates a new comment using the model and an initial prefix.

        Parameters
        ----------
        model : dict
            The dictionary containing all the pairs and their possible outcomes.

        number_of_sentences : int
            The maximum number of sentences.

        initial_prefix : str
            The word(s) that will start the chain.

        order : int
            The number of words in the state; this must match the order used when
            the model was built.

        Returns
        -------
        str
            The newly generated text.
        """
        counter = 0
        latest_suffix = initial_prefix
        final_sentence = latest_suffix + " "

        # We add a maximum sentence length to avoid going infinite in edge cases.
        for _ in range(500):
            try:
                latest_suffix = random.choice(model[latest_suffix])
            except KeyError:
                # If the current prefix has no outcomes we take another one randomly
                # and continue the chain.
                latest_suffix = self.get_prefix(model)

            final_sentence += latest_suffix + " "
            latest_suffix = " ".join(final_sentence.split()[-order:]).strip()

            for char in [".", "?", "!"]:
                if latest_suffix[-1] == char:
                    counter += 1
                    break

            if counter >= number_of_sentences:
                break

        return final_sentence
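    # Worked example (using the hypothetical model shown above, order=2): starting
    # from the prefix "I like", generate_comment() picks "pizza" from model["I like"],
    # slides the window to "like pizza" and picks "a", then slides to "pizza a" and
    # picks "lot.". Each sentence-ending character increments the counter, so with
    # number_of_sentences=1 the chain would stop here and return "I like pizza a lot. ".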
    def capsents(self, user_sentences):
        sents = sent_tokenize(user_sentences)
        capitalized_sents = [sent.capitalize() for sent in sents]
        joined_ = ' '.join(capitalized_sents)
        return joined_

    def create_csv(self, subreddit, latest_timestamp=None):
        """Downloads the subreddit comments, 500 at a time.

        Parameters
        ----------
        subreddit : str
            The subreddit name.

        latest_timestamp : int
            The latest comment timestamp.
        """
        base_url = "https://api.pushshift.io/reddit/comment/search/"

        params = {"subreddit": subreddit, "sort": "desc",
                  "sort_type": "created_utc", "size": 500,
                  "user_removed": False, "mod_removed": False}

        # After the first call of this function we will use the 'before' parameter.
        if latest_timestamp is not None:
            params["before"] = latest_timestamp

        with requests.get(base_url, params=params) as response:
            data = response.json()
            total_posts = len(data["data"])

            for item in data["data"]:
                # We only keep the comment body; the timestamp is saved for paging.
                self.latest_timestamp = item["created_utc"]

                # We fix mojibake and strip brackets, braces, asterisks and quotes.
                body = fix_text(item["body"])
                body = re.sub(r"\(|\)|\[|\]|\{|\}|\*|\"", "", body)
                body = self.capsents(body)

                self.comments_list.append([body])

        del data
        return self.comments_list
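    # Example of the first request this method makes (illustrative only; Pushshift's
    # API and rate limits have changed over time, so the parameters may need adjusting):
    #
    #   GET https://api.pushshift.io/reddit/comment/search/
    #       ?subreddit=python&sort=desc&sort_type=created_utc&size=500
    #       &user_removed=False&mod_removed=False
    #
    # Subsequent calls add &before=<latest_timestamp> to page further back in time.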
    def create_model(self, subreddits):
        """Reads the specified .csv file(s) and creates a training model from them.

        It is important to note that we merge all comments into one big string.
        This is to broaden the number of outcomes.
        """
        for csv_file in subreddits:
            csv_path = "{0}/data/{1}.csv".format(
                os.path.dirname(os.path.abspath(__file__)), csv_file.lower())

            with open(csv_path, "r", encoding="utf-8") as csv_data:
                # We iterate the .csv row by row.
                for row in csv.reader(csv_data):
                    # Remove unnecessary whitespace.
                    row = row[0].strip()

                    # We skip empty comments.
                    if len(row) == 0:
                        continue

                    # To improve results we ensure all comments end with a period.
                    ends_with_punctuation = False

                    for char in [".", "?", "!"]:
                        if row[-1] == char:
                            ends_with_punctuation = True
                            break

                    if not ends_with_punctuation:
                        row = row + "."

                    if len(self.allowed_subreddits) == 0:
                        self.comments_list.append(row)
                    else:
                        # We check if the comment's subreddit is in our allowed subreddits list.
                        if row["subreddit"].lower() in self.allowed_subreddits:
                            self.comments_list.append(row)

        # We separate each comment into words.
        words_list = " ".join(self.comments_list).split()

        for index, _ in enumerate(words_list):
            # This will always fail on the last word since it has nothing to pair with.
            try:
                prefix = " ".join(words_list[index:index + self.order])
                suffix = words_list[index + self.order]

                # If the prefix is not in the dictionary, we initialize it with the next word.
                if prefix not in self.word_dictionary.keys():
                    self.word_dictionary[prefix] = list([suffix])
                else:
                    # Otherwise we append it to its inner list of outcomes.
                    self.word_dictionary[prefix].append(suffix)
            except IndexError:
                pass

        del words_list
        return self.word_dictionary

    def doPrivmsg(self, irc, msg):
        (channel, message) = msg.args
        channel = channel.lower()

        if (callbacks.addressed(irc.nick, msg) or ircmsgs.isCtcp(msg)
                or not irc.isChannel(channel)
                or not self.registryValue('enable', channel)):
            return

        if msg.nick.lower() in self.registryValue('ignoreNicks', channel):
            log.debug("RedditBot: nick %s in ignoreNicks for %s" % (msg.nick, channel))
            return

        if irc.nick.lower() in message.lower():
            # Were we addressed in the channel?
            message = re.sub(re.escape(irc.nick), '', message, flags=re.IGNORECASE)
            probability = self.registryValue('probabilityWhenAddressed', channel)
        else:
            # Okay, we were not addressed, but what's the probability we should reply?
            probability = self.registryValue('probability', channel)

        #if self.registryValue('stripNicks'):
        #    removenicks = '|'.join(item + '\W.*?\s' for item in irc.state.channels[channel].users)
        #    text = re.sub(r'' + removenicks + '', 'MAGIC_NICK', text)

        # Run text ignores/strips/cleanup.
        message = self.processText(channel, message)

        if (message and random.random() < probability
                and os.path.exists("{0}/data/{1}.pickle".format(os.path.dirname(os.path.abspath(__file__)), channel))):
            try:
                new_comment = self.generate_comment(
                    model=self.model[channel], order=2, number_of_sentences=2,
                    initial_prefix=self.get_prefix_with_context(self.model[channel], message))
            except KeyError:
                self.model[channel] = self.read_model(
                    "{0}/data/{1}.pickle".format(os.path.dirname(os.path.abspath(__file__)), channel))
                new_comment = self.generate_comment(
                    model=self.model[channel], order=2, number_of_sentences=2,
                    initial_prefix=self.get_prefix_with_context(self.model[channel], message))
            except:
                return

            new_comment = self.capsents(new_comment)

            if new_comment and len(new_comment) > 1 and not new_comment.isspace():
                irc.reply(new_comment, prefixNick=False)

    def processText(self, channel, text):
        match = False
        ignore = self.registryValue("ignorePattern", channel)
        strip = self.registryValue("stripPattern", channel)

        text = ircutils.stripFormatting(text)
        text = fix_text(text)

        if self.registryValue('stripRelayedNick', channel):
            text = self.MATCH_MESSAGE_STRIPNICK.match(text).group('message')

        if ignore:
            match = re.search(ignore, text)
            if match:
                log.debug("RedditBot: %s matches ignorePattern for %s" % (text, channel))
                return

        if strip:
            match = re.findall(strip, text)
            if match:
                for x in match:
                    text = text.replace(x, '')
                    log.debug("RedditBot: %s matches stripPattern for %s. New text: %s" % (x, channel, text))

        if self.registryValue('stripURL', channel):
            new_text = re.sub(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', text)
            if new_text != text:
                log.debug("RedditBot: url(s) stripped from text for %s. New text: %s" % (channel, new_text))
                text = new_text

        # Strip whitespace from the beginning and end of the string.
        text = text.strip()
        # Normalize the whitespace in the string.
        text = utils.str.normalizeWhitespace(text)
        text = self.capsents(text)

        if text and len(text) > 1 and not text.isspace():
            return text
        else:
            return None
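    # The channel settings read above (enable, probability, probabilityWhenAddressed,
    # ignoreNicks, ignorePattern, stripPattern, stripRelayedNick, stripURL) are
    # expected to be registered in the plugin's config.py. A minimal sketch with
    # assumed defaults (not the actual config shipped with the plugin):
    #
    #   from supybot import conf, registry
    #
    #   RedditBot = conf.registerPlugin('RedditBot')
    #   conf.registerChannelValue(RedditBot, 'enable',
    #       registry.Boolean(False, "Reply automatically in this channel."))
    #   conf.registerChannelValue(RedditBot, 'probability',
    #       registry.Probability(0.0, "Chance of replying to an ordinary message."))
    #   conf.registerChannelValue(RedditBot, 'probabilityWhenAddressed',
    #       registry.Probability(1.0, "Chance of replying when the bot is mentioned."))
    #
    # The remaining values follow the same pattern (Boolean, String or
    # SpaceSeparatedListOfStrings as appropriate).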
    def csv(self, irc, msg, args, subreddits):
        """[subreddit_1] [subreddit_2] [subreddit_3] [...etc.]

        Load subreddit comments into csv files
        """
        channel = msg.args[0].lower()

        for subreddit in subreddits.lower().strip().split(' '):
            self.latest_timestamp = None
            self.comments_list = []
            self.max_comments = 20000
            data = []

            csv_path = "{0}/data/{1}.csv".format(
                os.path.dirname(os.path.abspath(__file__)), subreddit)

            irc.reply("Downloading: {0}".format(subreddit))

            tries = 0
            while len(data) <= self.max_comments:
                if tries >= 50:
                    break
                # create_csv() accumulates into self.comments_list and returns the list so far.
                data = self.create_csv(subreddit, self.latest_timestamp)
                tries += 1

            with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
                csv.writer(csv_file).writerows(data)

            irc.reply("Retrieved {0} comments from {1}".format(len(data), subreddit))

            del data
            del self.comments_list

    csv = wrap(csv, ['text'])

    def csv2model(self, irc, msg, args, channel, subreddits):
        """[channel] [subreddit_1] [subreddit_2] [subreddit_3] [...etc.]

        Load subreddit comment csv files into your conversational model
        """
        if not channel:
            channel = msg.args[0]
        channel = channel.lower()

        self.allowed_subreddits = []
        self.word_dictionary = {}
        self.comments_list = []
        self.order = 2

        subreddits = subreddits.lower().strip().split(' ')

        data = self.create_model(subreddits)

        # We save the dict as a pickle so we can reuse it later.
        with open("{0}/data/{1}.pickle".format(os.path.dirname(os.path.abspath(__file__)), channel), "wb") as model_file:
            pickle.dump(data, model_file)

        irc.reply("Modeled {0} comments from {1}".format(len(data), subreddits))

        del data
        del self.word_dictionary
        del self.comments_list

    csv2model = wrap(csv2model, [optional('channel'), 'text'])

    def seddit(self, irc, msg, args, channel, text):
        """[channel] <text>

        Respond to <text> using the channel's conversational model
        """
        if not channel:
            channel = msg.args[0]
        channel = channel.lower()

        try:
            new_comment = self.generate_comment(
                model=self.model[channel], order=2, number_of_sentences=2,
                initial_prefix=self.get_prefix_with_context(self.model[channel], text))
        except KeyError:
            self.model[channel] = self.read_model(
                "{0}/data/{1}.pickle".format(os.path.dirname(os.path.abspath(__file__)), channel))
            new_comment = self.generate_comment(
                model=self.model[channel], order=2, number_of_sentences=2,
                initial_prefix=self.get_prefix_with_context(self.model[channel], text))
        except:
            return

        #model_keys = list(model.keys())

        # Basic random.
        #new_comment = generate_comment(model=model, order=2,
        #                               number_of_sentences=2,
        #                               initial_prefix=random.choice(model_keys))

        # Selective random.
        #new_comment = generate_comment(model=model, order=2,
        #                               number_of_sentences=2,
        #                               initial_prefix=get_prefix(model))

        irc.reply(new_comment, prefixNick=False)

    seddit = wrap(seddit, [optional('channel'), 'text'])


Class = RedditBot
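# Typical workflow (the leading '@' is just an assumed command prefix; use whatever
# prefix or addressing style your bot is configured with):
#
#   @csv askreddit python                 -> writes data/askreddit.csv and data/python.csv
#   @csv2model #channel askreddit python  -> builds data/#channel.pickle from those CSVs
#   @seddit #channel hello there          -> replies using #channel's model, seeded by the text
#
# Once a channel has a pickle and 'enable' is set, doPrivmsg() will also reply on its
# own according to the configured probabilities.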