Optimize text sanitation

This commit is contained in:
Klaus-Uwe Mitterer 2017-02-21 16:10:39 +01:00
parent 4d76ee0116
commit 46a42222c8

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python3
import dbtools, setuptools, twitools
import argparse, html, markovify, nltk, operator, random, re, sys
import argparse, html, markovify, nltk, operator, random, re, string, sys
class Possy(markovify.NewlineText):
def word_split(self, sentence):
@ -14,18 +14,22 @@ class Possy(markovify.NewlineText):
return sentence
def sanitizeText(text):
    """Strip Twitter addressing noise from a tweet and return what is left.

    The following rules are applied repeatedly until none of them matches:

    * If "@" is among the first two characters (covers "@user ..." and
      ".@user ...") and the second word does not start with a lowercase
      ASCII letter, the first word is dropped.  A lowercase second word is
      taken to mean the mention is part of the sentence, so it is kept.
    * If the last word starts with "@", it is dropped.
    * If the text starts with "RT @", everything up to and including the
      first ":" is dropped.

    Args:
        text: The raw tweet text (a str).

    Returns:
        The sanitized text, or "" when nothing usable remains (empty or
        whitespace-only input, or a tweet consisting only of mentions).
    """
    while True:
        split = text.split()
        try:
            # Slicing instead of indexing text[0]/text[1] keeps a
            # one-character tweet from being discarded by an IndexError.
            if "@" in text[:2] and split[1][0] not in string.ascii_lowercase:
                text = text.partition(" ")[2]
                continue
            if split[-1][0] == "@":
                text = " ".join(split[:-1])
                continue
        except IndexError:
            # Ran out of words while stripping mentions: nothing left to keep.
            return ""
        if text[:4] == "RT @":
            # Keep only what follows the "RT @user:" prefix.
            text = text.partition(":")[2]
            continue
        return text
def getText(db = dbtools.dbHelper()):
    """Build the newline-separated training corpus from all stored tweets.

    Every row returned by the query is passed through sanitizeText(); lines
    that end up blank are dropped and HTML entities are unescaped.

    Args:
        db: Object providing executeQuery(sql), which yields rows whose first
            column is the tweet text.
            NOTE(review): the default helper is created once, when this
            function is defined, and is shared by all calls that omit `db`.

    Returns:
        A single str with one sanitized tweet line per line.
    """
    # Retweets are included; sanitizeText() strips their "RT @user:" prefix.
    rows = db.executeQuery('SELECT text FROM tweets;')
    # Join once instead of growing a string with += for every row.
    text = "".join(sanitizeText(row[0]) + "\n" for row in rows)
    kept = [line for line in text.strip().splitlines(True) if line.strip()]
    return html.unescape("".join(kept))