aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMattias Andrée <maandree@operamail.com>2012-10-04 22:11:33 +0200
committerMattias Andrée <maandree@operamail.com>2012-10-04 22:11:33 +0200
commit1096f2c86ea0e07383b97c4aa8b148ca841cc0f9 (patch)
tree5d6b39af90a457f50c08fb264fbc18b6221bc2c3
parente6bdf2e5ba92b4f01178a1d1bd1d601da38375da (diff)
downloadponysay-1096f2c86ea0e07383b97c4aa8b148ca841cc0f9.tar.gz
ponysay-1096f2c86ea0e07383b97c4aa8b148ca841cc0f9.tar.bz2
ponysay-1096f2c86ea0e07383b97c4aa8b148ca841cc0f9.zip
spelling correction mechanism with weigthed character change
-rwxr-xr-xponysay.py162
1 files changed, 162 insertions, 0 deletions
diff --git a/ponysay.py b/ponysay.py
index 1ebd9c3..71ea42e 100755
--- a/ponysay.py
+++ b/ponysay.py
@@ -1711,6 +1711,168 @@ class UCS():
'''
+Class used for correcting spellos and typos,
+
+Note that this implementation will not find that correctly spelled word are correct faster than it corrects words.
+It is also limited to words of size 0 to 127 (inclusive)
+'''
+class SpelloCorrecter: # Naïvely and quickly proted and adapted from optimised Java, may not be the nicest, or even fast, Python code
+ def __init__(self, directories, ending):
+ self.weights = {'k' : {'c' : 0.25, 'g' : 0.75, 'q' : 0.125},
+ 'c' : {'k' : 0.25, 'g' : 0.75, 's' : 0.5, 'z' : 0.5, 'q' : 0.125},
+ 's' : {'z' : 0.25, 'c' : 0.5},
+ 'z' : {'s' : 0.25, 'c' : 0.5},
+ 'g' : {'k' : 0.75, 'c' : 0.75, 'q' : 0.9},
+ 'o' : {'u' : 0.5},
+ 'u' : {'o' : 0.5, 'v' : 0.75, 'w' : 0.5},
+ 'b' : {'v' : 0.75},
+ 'v' : {'b' : 0.75, 'w' : 0.5, 'u' : 0.7},
+ 'w' : {'v' : 0.5, 'u' : 0.5},
+ 'q' : {'c' : 0.125, 'k' : 0.125, 'g' : 0.9}}
+
+ self.corrections = None
+ self.dictionary = [None] * 513
+ self.reusable = [0] * 512
+ self.dictionaryEnd = 512
+ self.closestDistance = 0
+
+ self.M = [None] * 128
+ for y in range(0, 128):
+ self.M[y] = [0] * 128
+ self.M[y][0] = y
+ m0 = self.M[0]
+ x = 127
+ while x > -1:
+ m0[x] = x
+ x -= 1
+
+ previous = ""
+ self.dictionary[-1] = previous;
+
+ for directory in directories:
+ for filename : os.listdir(directory):
+ if (not endswith(filename, ending)) or (len(filename) - len(ending) > 127):
+ continue
+ proper = filename[:-len(ending)]
+
+ if dictionaryEnd == 0:
+ dictionaryEnd = len(self.dictionary)
+ self.reusable = [0] * dictionaryEnd + self.reusable
+ self.dictionary = [None] * dictionaryEnd + self.dictionary
+
+ dictionaryEnd -= 1
+ dictionary[dictionaryEnd] = proper
+ prevCommon = min(len(previous), len(proper))
+ for i in range(0, prevCommon):
+ if previous[i] != proper[i]:
+ prevCommon = i
+ break
+ previous = dictionary[dictionaryEnd]
+ reusable[dictionaryEnd] = prevCommon
+
+
+ '''
+ Finds the closests correct spelled word.
+ The input is just one word, and the output is tuple
+ with a list of the closest spellings, and the weigthed distance.
+ '''
+ def correct(self, used):
+ if len(used) < 127:
+ return ([used], 0)
+
+ __correct(used)
+ return (seld.corrections, self.closestDistance)
+
+
+ def __correct(self, used):
+ self.closestDistance = 0x7FFFFFFF
+ previous = self.dictionary[-1]
+ prevLen = 0
+ usedLen = len(used)
+
+ proper = None
+ prevCommon = 0
+
+ d = len(self.dictionary)
+ while d > self.dictionaryEnd:
+ d -= 1
+ proper = self.dictionary[d]
+ if abs(len(proper) - usedLen) <= self.closestDistance:
+ if previous == self.dictionary[d + 1]:
+ prevCommon = self.reusable[d];
+ else:
+ prevCommon = min(prevLen, len(proper))
+ for i in range(0, prevCommon):
+ if previous[i] != proper[i]:
+ prevCommon = i
+ break
+
+ skip = min(prevLen, len(proper))
+ i = prevCommon
+ while i < skip:
+ for u in range(0, usedLen):
+ if (used[u] == previous[i]) or (used[u] == proper[i]):
+ skip = i
+ break
+ i += 1
+
+ common = min(skip, min(usedLen, len(proper)))
+ for i in range(0, common):
+ if used[i] != proper[i]:
+ common = i
+ break
+
+ distance = self.__distance(proper, skip, proper.length, used, common, usedLen)
+
+ if self.closestDistance > distance:
+ self.closestDistance = distance
+ corrections = [proper]
+ elif self.closestDistance == distance:
+ corrections.append(proper)
+
+ previous = proper;
+ if distance >= 0x7FFFFF00:
+ prevLen = distance & 255
+ else:
+ prevLen = len(proper)
+
+
+ def __distance(self, proper, y0, yn, used, x0, xn):
+ my = self.M[y0]
+ for y in range(y0, yn):
+ best = 0x7FFFFFFF
+ p = proper[y]
+ myy = self.M[y + 1] # only one array bound check, and at most one + ☺
+ x = x0
+ while x < xn:
+ change = my[x]
+ u = used[x]
+ if p == u:
+ # commence black magick … twilight would be so disappointed
+ x += 1
+ myy[x] = change
+ best = min(best, change)
+ remove = myy[x]
+ add = my[x + 1]
+
+ cw = 1
+ if my[x] in self.weights:
+ if p in self.weights[u]:
+ cw = self.weights[u][p]
+
+ myy[x] = min(cw + change, 1 + min(remove, add))
+ if best > myy[x]:
+ best = myy[x]
+
+ if best > self.closestDistance:
+ return 0x7FFFFFFF | y
+ my = myy
+ return my[xn]
+
+
+
+
+'''
The user's home directory
'''
HOME = os.environ['HOME'] if 'HOME' in os.environ else os.path.expanduser('~')