Optimize text sanitization

2017-02-21 16:10:39 +01:00 · 2017-02-21 16:10:39 +01:00 · 46a42222c8
parent 4d76ee0116
commit 46a42222c8
1 changed file with 10 additions and 6 deletions
--- a/markov.py
+++ b/markov.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3

 import dbtools, setuptools, twitools
-import argparse, html, markovify, nltk, operator, random, re, sys
+import argparse, html, markovify, nltk, operator, random, re, string, sys

 class Possy(markovify.NewlineText):
 def word_split(self, sentence):
@@ -14,18 +14,22 @@ class Possy(markovify.NewlineText):
  return sentence

 def sanitizeText(text):
+ split = text.split()
 try:
-  if text[0] == "@":
-   return sanitizeText(text.partition(" ")[2])
-  if text.split()[-1][0] == "@":
-   return sanitizeText(" ".join(text.split()[:-1]))
+  if "@" in (text[0], text[1]):
+   if split[1][0] not in string.ascii_lowercase:
+    return sanitizeText(text.partition(" ")[2])
+  if split[-1][0] == "@":
+   return sanitizeText(" ".join(split[:-1]))
+  if text[:4] == "RT @":
+   return sanitizeText(text.partition(":")[2])
 except:
  return ""
 return text

 def getText(db = dbtools.dbHelper()):
 text = ""
- for string in db.executeQuery('SELECT text FROM tweets WHERE text NOT LIKE "RT %";'):
+ for string in db.executeQuery('SELECT text FROM tweets;'):
  text += sanitizeText(string[0]) + "\n"
 return html.unescape("".join([s for s in text.strip().splitlines(True) if s.strip()]))