oddluck-limnoria-plugins/RedditBot/plugin.py

###
# Copyright (c) 2019 oddluck
# All rights reserved.
#
#
###
import supybot.utils as utils
from supybot.commands import *
import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks
import supybot.ircmsgs as ircmsgs
import supybot.log as log
import os
import csv
import time
from datetime import datetime
import pickle
import requests
import random
import re
from nltk.tokenize import sent_tokenize
from ftfy import fix_text
try:
from supybot.i18n import PluginInternationalization
_ = PluginInternationalization('RedditBot')
except ImportError:
# Placeholder that allows the plugin to run on a bot
# without the i18n module
_ = lambda x: x
class RedditBot(callbacks.Plugin):
"""Generates chat replies using subreddit comments"""
threaded = True
def __init__(self, irc):
self.__parent = super(RedditBot, self)
self.__parent.__init__(irc)
self.stopwords = self.add_extra_words()
self.model = {}
self.MATCH_MESSAGE_STRIPNICK = re.compile('^(<[^ ]+> )?(?P<message>.*)$')
def add_extra_words(self):
"""Adds the title and uppercase version of all words to STOP_WORDS.
We parse local copies of stop words downloaded from the following repositories:
https://github.com/stopwords-iso/stopwords-es
https://github.com/stopwords-iso/stopwords-en
"""
ES_STOPWORDS_FILE = "{}/stopwords-es.txt".format(os.path.dirname(os.path.abspath(__file__)))
EN_STOPWORDS_FILE = "{}/stopwords-en.txt".format(os.path.dirname(os.path.abspath(__file__)))
STOP_WORDS = set()
with open(ES_STOPWORDS_FILE, "r", encoding="utf-8") as temp_file:
for word in temp_file.read().splitlines():
STOP_WORDS.add(word)
with open(EN_STOPWORDS_FILE, "r", encoding="utf-8") as temp_file:
for word in temp_file.read().splitlines():
STOP_WORDS.add(word)
extra_words = list()
for word in STOP_WORDS:
extra_words.append(word.title())
extra_words.append(word.upper())
for word in extra_words:
STOP_WORDS.add(word)
return STOP_WORDS
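# Rough illustration of the expansion above (assuming a word like "about" appears
# in the downloaded stop word lists): "about" also contributes "About" and "ABOUT",
# so the later keyword filtering behaves case-insensitively for common words.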
def read_model(self, file_name):
"""Loads the specified pickle file.
Parameters
----------
file_name : str
The location of the pickle file.
Returns
-------
dict
The dictionary inside the pickle.
"""
with open(file_name, "rb") as model_file:
return pickle.load(model_file)
def get_prefix(self, model):
"""Get a random prefix that starts in uppercase.
Parameters
----------
model : dict
The dictionary containing all the pairs and their possible outcomes.
Returns
-------
str
The randomly selected prefix.
"""
model_keys = list(model.keys())
# We give it a maximum of 10,000 tries.
for _ in range(10000):
random_prefix = random.choice(model_keys)
if random_prefix[0].isupper():
ends_with_punctuation = False
stripped_suffix = random_prefix.strip()
for char in [".", "?", "!"]:
if stripped_suffix[-1] == char:
ends_with_punctuation = True
break
if not ends_with_punctuation:
break
return random_prefix
def get_prefix_with_context(self, model, context):
"""Get a random prefix that matches the given context.
Parameters
----------
model : dict
The dictionary containing all the pairs and their possible outcomes.
context : str
A sentence which will be separated into keywords.
Returns
-------
str
The randomly selected context-aware prefix.
"""
# Some light cleanup.
context = context.replace("?", "").replace("!", "").replace(".", "")
context_keywords = context.split()
# We remove short words and stop words from the context.
# We use reversed() to remove items from the list without affecting the sequence.
for word in reversed(context_keywords):
if len(word) <= 3 or word in self.stopwords:
context_keywords.remove(word)
# If our context has no keywords left we return a random prefix.
if len(context_keywords) == 0:
return self.get_prefix(model)
# We are going to sample one prefix for each available keyword and return only one.
model_keys = list(model.keys())
random.shuffle(model_keys)
sampled_prefixes = list()
for word in context_keywords:
for prefix in model_keys:
if word in prefix or word.lower() in prefix or word.title() in prefix:
sampled_prefixes.append(prefix)
break
# If we don't get any samples we fall back to the random prefix method.
if len(sampled_prefixes) == 0:
return self.get_prefix(model)
else:
return random.choice(sampled_prefixes)
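# Hypothetical example of the keyword matching above: for the context
# "What do you think about pizza?", short words and stop words are dropped
# (assuming "what", "think" and "about" are in the stop word lists), leaving
# ["pizza"]; the first shuffled model key containing "pizza" or "Pizza" is then
# sampled, e.g. a prefix like "pizza is".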
def generate_comment(self, model, number_of_sentences, initial_prefix, order):
"""Generates a new comment using the model and an initial prefix.
Parameters
----------
model : dict
The dictionary containing all the pairs and their possible outcomes.
number_of_sentences : int
The maximum number of sentences.
initial_prefix : str
The word(s) that will start the chain.
order : int
The number of words per state; this must match the order used when the model was built (csv2model uses 2).
Returns
-------
str
The newly generated text.
"""
counter = 0
latest_suffix = initial_prefix
final_sentence = latest_suffix + " "
# We cap the number of iterations to avoid looping forever in edge cases.
for _ in range(500):
try:
latest_suffix = random.choice(model[latest_suffix])
except KeyError:
# If the current prefix has no known suffixes we pick a new random prefix and continue the chain.
latest_suffix = self.get_prefix(model)
final_sentence += latest_suffix + " "
latest_suffix = " ".join(final_sentence.split()[-order:]).strip()
for char in [".", "?", "!"]:
if latest_suffix[-1] == char:
counter += 1
break
if counter >= number_of_sentences:
break
return final_sentence
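# Worked example with a hypothetical order-2 model:
#   model = {"I like": ["pizza."], "like pizza.": ["It"], "pizza. It": ["rocks."]}
# Starting from the prefix "I like", the chain appends "pizza.", re-keys on the
# last two words ("like pizza."), appends "It", and so on, stopping once the
# requested number of sentence-ending ".", "?" or "!" tokens has been produced.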
def capsents(self, user_sentences):
"""Splits the text into sentences and capitalizes the first letter of each."""
sents = sent_tokenize(user_sentences)
capitalized_sents = [sent.capitalize() for sent in sents]
joined_ = ' '.join(capitalized_sents)
return joined_
def create_csv(self, subreddit, latest_timestamp=None):
"""
Downloads the subreddit comments, 500 at a time.
Parameters
----------
subreddit : str
The subreddit name.
latest_timestamp : int
The latest comment timestamp.
"""
base_url = "https://api.pushshift.io/reddit/comment/search/"
params = {"subreddit": subreddit, "sort": "desc",
"sort_type": "created_utc", "size": 500, "user_removed": False, "mod_removed": False}
# After the first call of this function we will use the 'before' parameter.
if latest_timestamp is not None:
params["before"] = latest_timestamp
with requests.get(base_url, params=params) as response:
data = response.json()
total_posts = len(data["data"])
for item in data["data"]:
# We only keep the comment body; the timestamp is tracked for pagination.
self.latest_timestamp = item["created_utc"]
# fix_text repairs mojibake and HTML entities; the regex then strips
# brackets, braces, asterisks and quotes.
body = fix_text(item["body"])
body = re.sub(r'[()\[\]{}*"]', "", body)
body = self.capsents(body)
self.comments_list.append([body])
del data
return self.comments_list
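# For reference, the request built above is roughly equivalent to:
#   https://api.pushshift.io/reddit/comment/search/?subreddit=askreddit&sort=desc
#       &sort_type=created_utc&size=500&user_removed=False&mod_removed=False
# ("askreddit" is just a placeholder). Pushshift's API has changed over time, so
# the exact response shape is an assumption; this code only relies on
# data["data"], item["created_utc"] and item["body"].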
def create_model(self, subreddits):
"""Reads the specified .csv file(s) and creates a training model from them.
It is important to note that we merge all comments into a big string.
This is to broaden the number of outcomes.
"""
for csv_file in subreddits:
# We iterate the .csv row by row.
for row in csv.reader(open("{0}/data/{1}.csv".format(os.path.dirname(os.path.abspath(__file__)), csv_file.lower()), "r")):
# Remove unnecessary whitespaces.
row = row[0].strip()
# We skip empty comments.
if len(row) == 0:
continue
# To improve results we ensure all comments end with a period.
ends_with_punctuation = False
for char in [".", "?", "!"]:
if row[-1] == char:
ends_with_punctuation = True
break
if not ends_with_punctuation:
row = row + "."
if len(self.allowed_subreddits) == 0:
self.comments_list.append(row)
else:
# The csv files only contain comment bodies, so we filter on the
# source subreddit's file name instead.
if csv_file.lower() in self.allowed_subreddits:
self.comments_list.append(row)
# We separate each comment into words.
words_list = " ".join(self.comments_list).split()
for index, _ in enumerate(words_list):
# This will always fail on the last word(s) since there is nothing left to pair them with.
try:
prefix = " ".join(words_list[index:index+self.order])
suffix = words_list[index+self.order]
# If the word is not in the dictionary, we init it with the next word.
if prefix not in self.word_dictionary.keys():
self.word_dictionary[prefix] = list([suffix])
else:
# Otherwise we append it to its inner list of outcomes.
self.word_dictionary[prefix].append(suffix)
except IndexError:
pass
del words_list
return self.word_dictionary
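# Sketch of the resulting structure for order=2: the comments
# "I like pizza." and "I like pasta." produce a dictionary like
#   {"I like": ["pizza.", "pasta."], "like pizza.": ["I"], "pizza. I": ["like"]}
# Repeated suffixes are kept on purpose so that more common continuations are
# more likely to be sampled by random.choice().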
def doPrivmsg(self, irc, msg):
(channel, message) = msg.args
channel = channel.lower()
if callbacks.addressed(irc.nick, msg) or ircmsgs.isCtcp(msg) or not irc.isChannel(channel) or not self.registryValue('enable', channel):
return
if msg.nick.lower() in self.registryValue('ignoreNicks', channel):
log.debug("RedditBot: nick %s in ignoreNicks for %s" % (msg.nick, channel))
return
if irc.nick.lower() in message.lower():
# Were we addressed in the channel?
message = re.sub(re.escape(irc.nick), '', message, flags=re.IGNORECASE)
probability = self.registryValue('probabilityWhenAddressed', channel)
else:
# Okay, we were not addressed, but what's the probability we should reply?
probability = self.registryValue('probability', channel)
#if self.registryValue('stripNicks'):
# removenicks = '|'.join(item + '\W.*?\s' for item in irc.state.channels[channel].users)
# text = re.sub(r'' + removenicks + '', 'MAGIC_NICK', text)
message = self.processText(channel, message) # Run text ignores/strips/cleanup.
if message and random.random() < probability and os.path.exists("{0}/data/{1}.pickle".format(os.path.dirname(os.path.abspath(__file__)), channel)):
try:
new_comment = self.generate_comment(model=self.model[channel], order=2,
number_of_sentences=2,
initial_prefix=self.get_prefix_with_context(self.model[channel], message))
except KeyError:
self.model[channel] = self.read_model("{0}/data/{1}.pickle".format(os.path.dirname(os.path.abspath(__file__)), channel))
new_comment = self.generate_comment(model=self.model[channel], order=2,
number_of_sentences=2,
initial_prefix=self.get_prefix_with_context(self.model[channel], message))
except Exception:
return
new_comment = self.capsents(new_comment)
if new_comment and len(new_comment) > 1 and not new_comment.isspace():
irc.reply(new_comment, prefixNick=False)
def processText(self, channel, text):
match = False
ignore = self.registryValue("ignorePattern", channel)
strip = self.registryValue("stripPattern", channel)
text = ircutils.stripFormatting(text)
text = fix_text(text)
if self.registryValue('stripRelayedNick', channel):
text = self.MATCH_MESSAGE_STRIPNICK.match(text).group('message')
if ignore:
match = re.search(ignore, text)
if match:
log.debug("RedditBot: %s matches ignorePattern for %s" % (text, channel))
return
if strip:
match = re.findall(strip, text)
if match:
for x in match:
text = text.replace(x, '')
log.debug("RedditBot: %s matches stripPattern for %s. New text text: %s" % (x, channel, text))
if self.registryValue('stripURL', channel):
new_text = re.sub(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', text)
if new_text != text:
log.debug("RedditBot: url(s) stripped from text for %s. New text text: %s" % (channel, new_text))
text = new_text
text = text.strip() # Strip whitespace from beginning and the end of the string.
text = utils.str.normalizeWhitespace(text) # Normalize the whitespace in the string.
text = self.capsents(text)
if text and len(text) > 1 and not text.isspace():
return text
else:
return None
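# Example of the stripRelayedNick handling above: a relayed line like
# "<someuser> hello there" is reduced to "hello there" before any further
# cleanup, so relay bot prefixes don't end up as context keywords.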
def csv(self, irc, msg, args, subreddits):
"""[subreddit_1] [subreddit_2] [subreddit_3] [...etc.]
Load subreddit comments into csv files
"""
channel = msg.args[0].lower()
for subreddit in subreddits.lower().strip().split(' '):
self.latest_timestamp = None
self.comments_list = []
self.max_comments = 20000
data = []
writer = csv.writer(open("{0}/data/{1}.csv".format(os.path.dirname(os.path.abspath(__file__)), subreddit),
"w", newline="", encoding="utf-8"))
irc.reply("Downloading:", subreddit)
tries = 0
while len(data) <= self.max_comments:
if tries >= 50:
break
# create_csv() appends to self.comments_list and returns the whole list,
# so we reassign instead of extending to avoid duplicating earlier batches.
data = self.create_csv(subreddit, self.latest_timestamp)
tries += 1
writer.writerows(data)
irc.reply("Retrieved {0} comments from {1}".format(len(data), subreddit))
del data
del self.comments_list
csv = wrap(csv, ['text'])
def csv2model(self, irc, msg, args, channel, subreddits):
"""[channel] [subreddit_1] [subreddit_2] [subreddit_3] [...etc.]
Load subreddit comment csv files into the channel's conversational model
"""
if not channel:
channel = msg.args[0]
channel = channel.lower()
self.allowed_subreddits = []
self.word_dictionary = {}
self.comments_list = []
self.order = 2
subreddits = subreddits.lower().strip().split(' ')
# We save the dict as a pickle so we can reuse it on the bot script.
data = self.create_model(subreddits)
with open("{0}/data/{1}.pickle".format(os.path.dirname(os.path.abspath(__file__)), channel), "wb") as model_file:
pickle.dump(data, model_file)
irc.reply("Modeled {0} comments from {1}".format(len(data), subreddits))
del data
del self.word_dictionary
del self.comments_list
csv2model = wrap(csv2model, [optional('channel'), 'text'])
def seddit(self, irc, msg, args, channel, text):
"""[channel] <text>
Respond to <text> using the channel's conversational model
"""
if not channel:
channel = msg.args[0]
channel = channel.lower()
try:
new_comment = self.generate_comment(model=self.model[channel], order=2,
number_of_sentences=2,
initial_prefix=self.get_prefix_with_context(self.model[channel], text))
except KeyError:
self.model[channel] = self.read_model("{0}/data/{1}.pickle".format(os.path.dirname(os.path.abspath(__file__)), channel))
new_comment = self.generate_comment(model=self.model[channel], order=2,
number_of_sentences=2,
initial_prefix=self.get_prefix_with_context(self.model[channel], text))
except Exception:
return
#model_keys = list(model.keys())
# Basic random.
#new_comment = generate_comment(model=model, order=2,
# number_of_sentences=2,
# initial_prefix=random.choice(model_keys))
# Selective random.
#new_comment = generate_comment(model=model, order=2,
# number_of_sentences=2,
# initial_prefix=get_prefix(model))
irc.reply(new_comment, prefixNick=False)
seddit = wrap(seddit, [optional('channel'), 'text'])
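# Typical workflow, assuming "@" is the bot's configured command prefix and the
# plugin's data/ directory already exists (channel and subreddit names are examples):
#   @csv askreddit              -> downloads comments into data/askreddit.csv
#   @csv2model #chat askreddit  -> builds data/#chat.pickle from that csv
#   @seddit #chat hello bot     -> generates a reply from the #chat model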
Class = RedditBot