Subject: [challenge] AutoDecipher 0.1 - The handy automatic deciphering tool.

Posted on: October 02 2011 @ 09:16 AM
By: wvf

Content:

Just a heads-up, this post (but not this paragraph) is rather full of spoilers. You can find the original deciphering tool here. At the moment, if you want to decipher something, this is the recommended version. AutoDecipher requires a rather high % of words guessed right to be useful.

Following the warm reception, nay, the smashing success of my first ciphering script, I've been working on AutoDecipher.py. This handy-dandy script, given the input of

PHP Formatted Code
CORPORAL PUNISHMENT
HAVELOCK
STERN
as possible words in the cipher and a cipher written by one of the two outputs a mostly-correct and very readable message, as long as the message is
PHP Formatted Code
Jy oamq Tlqnlqmi Nukfrdjaks,
Niamra saii ja jlqa.

Yluqr,
-Dmvailth
It works on my computer in about one second.

CHALLENGE:
Write a better deciphering program than I have. You can use any language, but you can't use a deciphering library.

The Competition:
My program requires a fairly high number of known, correctly-spelled words for it to guess correctly. For instance, in the above example, there are only a few mystery words to confuse the script. Right now, being version 0.1, AutoDecipher kind of sucks. This is the only script/hint pair it will read.
I plan a better release soon, but it's 2:30 in the morning here. Frown
Plus, I believe the real interest is in the ciphers themselves. However, I can't figure out how to decode those at the moment.

autodecipher.py:
PHP Formatted Code
dictionary_name = 'small_dictionary.txt'
cipher_name = 'in.txt'
hint_name = 'hint.txt'
deciphered_name = 'out.txt'

alphabet                      = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
template_alphabet       = "                          "

def patternize(word): #Returns a tuple containing the distinct letters of the word passed in and a list of when the letters occured in the word. Basically, it ciphers it on a word-by-word basis to a cipher-independant value. The same word when ciphered by any cipher will always return the same pattern, and that is what we'll match against in this program.#
        letters = ''
        for letter in word:
                if not letter in letters:
                        letters += letter
        wordPattern = []
        for letter in word:
                wordPattern.append(letters.index(letter))
        return (letters, wordPattern)
       
dictionary = open(dictionary_name, 'r')
hint = open(hint_name, 'r')
cipher = open(cipher_name, 'r')
decipher = open(deciphered_name, 'w')

dictData = []     ##### the dictionary for mass guessing
for line in dictionary:
        dictData.append(patternize(line.strip().upper()))
       
hintData = []       ##### the primary list
for line in hint:
        #hintData.append(patternize(line.strip().upper())) #This line suspended, for now, because the ciphered text data won't contain spaces so we don't need to match against them.
        for word in line.split():
                hintData.append(patternize(word.upper()))
hintData1 = []
hintData2 = []
for hint in hintData: #strip duplicate enteries. Sets don't work here.
        if not hint[0] in hintData1:
                hintData1 += [hint[0]]
                hintData2.append(hint)
hintData = hintData2

cipherData = []   ##### list of words in letter
cipherDataRaw = ''
for letter in cipher.read().upper():
        if letter.isalpha():
                cipherDataRaw += letter
        elif letter in [' ', '\n']:
                cipherDataRaw += '\n'
for line in cipherDataRaw.split():
        cipherData.append(patternize(line))
       
       
#So, now we have dictData, hintData, and cipherData.
#Let's try and extract what data we can from the hints.

def findHits(bullet, target):   #Give the words you're looking for, then the ciphered words.
        hits = []                                   #Returns the english hash, and the ciphered hashes that match.
        for possibleHit in bullet:
                for possibleMatch in target:
                        if possibleHit[1] == possibleMatch[1]:
                                hits.append((possibleHit[0], possibleMatch[0]))
        return hits

hintHits = findHits(hintData, cipherData)

def totalHits(hits):
        hitTotals = [] #a list containing, for each letter in the alphabet, a list containing a count of each potential cipher match for each letter. Basically, might [a][a]=0 and [z][z]=9, because a will never cipher to a and z will usually cipher to z.
        for letter in alphabet:
                hitTotal = []
                for letter in alphabet:
                        hitTotal.append(0)
                hitTotals.append(hitTotal)
               
        for hit in hits:
                for letter in range(len(hit[0])):
                        hitLetterIndex = alphabet.find(hit[0][letter])
                        matchLetterIndex = alphabet.find(hit[1][letter])
                        hitTotals[hitLetterIndex][matchLetterIndex] += 1
                       
        def topGreatest(itr, line): #Returns a list the most-occuring possible ciphers of letters. The list is _minimum_ itr long, but can be more.
                currentBest = ['', 0.5]
                for index in range(len(line)):
                        if line[index] > currentBest[1]:
                                currentBest = [alphabet[index], line[index]]
                        elif line[index] == currentBest[1]:
                                currentBest[0] += alphabet[index]
                if currentBest[0] != '':
                        if itr > 1:
                                for letter in currentBest[0]:
                                        line[alphabet.find(letter)] = 0
                                return [currentBest] + topGreatest(itr-1, line)
                        return [currentBest]
                return []
               
        greatestHitTotals=[]
        for line in hitTotals:
                greatestHitTotals.append(topGreatest(3, line))
               
        return greatestHitTotals
       
def simpleCypherExtractor(totals): #Takes the output table of totalHits().
        cypher = ''
        for line in totals:
                if len(line) > 0:
                        cypher += line[0][0][0]
                else:
                        cypher += ' '
        return cypher
       
def printDeciphered(dst, src, msg):
        lacking = ''
        for character in src:
                if not character.upper() in dst.upper():
                        lacking += character
        if len(lacking) > 0:
                print "Missing characters in cipher:", lacking, "\n"
       
        def cipher(char):
                if not char.isalpha(): #string not cypherable
                        return char
                letter = dst.upper().find(char.upper()) #String not in destination. (incomplete alphabet)
                if letter == -1: #If you're getting stars, you're using an incomplete source. Switch src and dst values.
                        return char.lower()
                letter = src[letter:letter+1]
                if letter == ' ':
                        return '*'
                return letter
       
        deciphered = ''
        for character in msg:
                deciphered += cipher(character)
        print deciphered
       
hintCipher = simpleCypherExtractor(totalHits(hintHits)) #So, for index 0, we have something like [['M', 2]]. This means that two words worked well with A being encoded as M.
       
#for index in range(len(hintHitTotals)):
#       print alphabet[index] + '> ' + str(hintHitTotals[index])
       
print "Partial Cypher:"
print "from: " + hintCipher
print "  to: " + alphabet
cipher.seek(0)
printDeciphered(hintCipher, alphabet, cipher.read().upper())


 


in.txt:
PHP Formatted Code
Jy oamq Tlqnlqmi Nukfrdjaks,
Niamra saii ja jlqa.

Yluqr,
-Dmvailth


hint.txt
PHP Formatted Code
CORPORAL PUNISHMENT
HAVELOCK
STERN


small_dictionary.txt: (unused atm, but must have present)
PHP Formatted Code
a
aah
aardvark
ab
aback
 


Have a nice day. Smile



Replies:

Re: [challenge] AutoDecipher 0.1 - The handy automatic deciphering tool.

Posted on: October 03 2011 @ 08:45 AM
By: wvf

Content:

Well, time for the final version. Unfortunately, it's totally broken and I have no idea why. Frown

PHP Formatted Code
dictionary_name = 'small_dictionary.txt'
cipher_name = 'in.txt'
hint_name = 'hint.txt'
deciphered_name = 'out.txt'

alphabet                      = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
template_alphabet       = "                          "

def patternize(word): #Returns a tuple containing the distinct letters of the word passed in and a list of when the letters occured in the word. Basically, it ciphers it on a word-by-word basis to a cipher-independant value. The same word when ciphered by any cipher will always return the same pattern, and that is what we'll match against in this program.#
        letters = ''
        for letter in word:
                if not letter in letters:
                        letters += letter
        wordPattern = []
        for letter in word:
                wordPattern.append(letters.index(letter))
        return (letters, wordPattern)
       
dictionary = open(dictionary_name, 'r')
hint = open(hint_name, 'r')
cipher = open(cipher_name, 'r')
decipher = open(deciphered_name, 'w')

print 'Processing dictionaries...'

dictData = []     ##### the dictionary for mass guessing
for line in dictionary:
        dictData.append(patternize(line.strip().upper()))
       
hintData = []       ##### the primary list
for line in hint:
        #hintData.append(patternize(line.strip().upper())) #This line suspended, for now, because the ciphered text data won't contain spaces so we don't need to match against them.
        for word in line.split():
                hintData.append(patternize(word.upper()))
dictionary.seek(0)
for line in dictionary:
        for word in line.split():
                hintData.append(patternize(word.upper()))
               
hintData1 = []
hintData2 = []
for hint in hintData: #strip duplicate enteries. Sets don't work here.
        if not hint[0] in hintData1:
                hintData1 += [hint[0]]
                hintData2.append(hint)
hintData = hintData2

cipherData = []   ##### list of words in letter
cipherDataRaw = ''
for letter in cipher.read().upper():
        if letter.isalpha():
                cipherDataRaw += letter
        elif letter in [' ', '\n']:
                cipherDataRaw += '\n'
for line in cipherDataRaw.split():
        cipherData.append(patternize(line))
       
       
print 'Matching dictionary data to cipher...'
       
#So, now we have dictData, hintData, and cipherData.
#Let's try and extract what data we can from the hints.

def findHits(bullet, target):   #Give the words you're looking for, then the ciphered words.
        hits = []                                   #Returns the english hash, and the ciphered hashes that match.
        for possibleHit in bullet:
                for possibleMatch in target:
                        if possibleHit[1] == possibleMatch[1]:
                                hits.append((possibleHit[0], possibleMatch[0]))
        return hits

hintHits = findHits(hintData, cipherData)

def totalHits(hits):
        hitTotals = [] #a list containing, for each letter in the alphabet, a list containing a count of each potential cipher match for each letter. Basically, might [a][a]=0 and [z][z]=9, because a will never cipher to a and z will usually cipher to z.
        for letter in alphabet:
                hitTotal = []
                for letter in alphabet:
                        hitTotal.append(0)
                hitTotals.append(hitTotal)
               
        for hit in hits:
                for letter in range(len(hit[0])):
                        hitLetterIndex = alphabet.find(hit[0][letter])
                        matchLetterIndex = alphabet.find(hit[1][letter])
                        hitTotals[hitLetterIndex][matchLetterIndex] += 1
                       
        def topGreatest(itr, line): #Returns a list the most-occuring possible ciphers of letters. The list is _minimum_ itr long, but can be more.
                currentBest = ['', 0.5]
                for index in range(len(line)):
                        if line[index] > currentBest[1]:
                                currentBest = [alphabet[index], line[index]]
                        elif line[index] == currentBest[1]:
                                currentBest[0] += alphabet[index]
                if currentBest[0] != '':
                        if itr > 1:
                                for letter in currentBest[0]:
                                        line[alphabet.find(letter)] = 0
                                return [currentBest] + topGreatest(itr-1, line)
                        return [currentBest]
                return []
               
        greatestHitTotals=[]
        for line in hitTotals:
                greatestHitTotals.append(topGreatest(3, line))
               
        return greatestHitTotals
       
def simpleCypherExtractor(totals): #Takes the output table of totalHits().
        cypher = ''
        for line in totals:
                if len(line) > 0:
                        cypher += line[0][0][0]
                else:
                        cypher += ' '
        return cypher
       
def printDeciphered(dst, src, msg):
        lacking = ''
        for character in src:
                if not character.upper() in dst.upper():
                        lacking += character
        if len(lacking) > 0:
                print "Missing characters in cipher:", lacking, "\n"
       
        def cipher(char):
                if not char.isalpha(): #string not cypherable
                        return char
                letter = dst.upper().find(char.upper()) #String not in destination. (incomplete alphabet)
                if letter == -1: #If you're getting stars, you're using an incomplete source. Switch src and dst values.
                        return char.lower()
                letter = src[letter:letter+1]
                if letter == ' ':
                        return '*'
                return letter
       
        deciphered = ''
        for character in msg:
                deciphered += cipher(character)
        print deciphered
       
hintCipher = simpleCypherExtractor(totalHits(hintHits)) #So, for index 0, we have something like [['M', 2]]. This means that two words worked well with A being encoded as M.
       
#for index in range(len(hintHitTotals)):
#       print alphabet[index] + '> ' + str(hintHitTotals[index])

print ""
print "Partial Cypher #1:"
print "from: " + hintCipher
print "  to: " + alphabet
print ""
cipher.seek(0)
printDeciphered(hintCipher, alphabet, cipher.read().upper())
print ""
print "Starting weighted match..."

#Tbh, the above doesn't work so hot if the hint.txt file has a few mismatches. Next, a weighted attempt...

def makeCiphersFrom(hits): #where hits could = [('STERN', 'AIHVP')]
        ciphers = []
        id = 0
        lastHit = ''
        for hit in hits:
                cipher = ''
                for letter in alphabet:
                        found = hit[0].find(letter)
                        if found >= 0:
                                cipher += hit[1][found]
                        else:
                                cipher += ' '
                if lastHit != hit[0]:
                        lastHit = hit[0]
                        id+=1
                ciphers.append([cipher, id])
        different_groups = ciphers[len(ciphers)-1][1]
        #print 'different groups', different_groups
        groupCounts = []
        for group in range(0,different_groups+1):
                groupCounts.append(100/(len(filter(lambda cipher_group: cipher_group[1]==group, ciphers))+5)) ##Adder is weight factor; must be at least 1. Raise if mis-spelled word influencing results too much?
        for cipher in ciphers:
                cipher.append(groupCounts[cipher[1]])
        return ciphers #A list of (cipher string, group, confidence)s.
       
def compoundCiphers(ciphers):
        resultHits = []
        for index in range(len(alphabet)):
                possibleHits = []
                #which strings might be ciphered to this?
                for cipher in ciphers:
                        cipher_char = cipher[0][index]
                        if cipher_char != ' ':
                                possibleHits.append([cipher_char, cipher[1], cipher[2]]) #letter-group-weight
                resultHits.append(possibleHits)
               
        resultTotals = []
        for index in range(len(alphabet)): #remove duplicate enteries in resultHits
                targetLetter = alphabet[index]
                targetHits = resultHits[index]     
                groupsChecked=[]
                possible=[]
                for letter in targetHits:
                        if not (letter[0], letter[1]) in groupsChecked:
                                groupsChecked.append((letter[0], letter[1]))
                                possible.append((letter[0], letter[2]))
                resultTotals.append(groupsChecked)
       
        compiledTotals = []
        def sumLetters(series):
                def listFind(element, list):
                        counter = 0
                        for elements in list:
                                if element == elements:
                                        return counter
                                counter += 1
                        return -1
                letters = []
                sums = []
                for element in series:
                        index = listFind(element[0], letters)
                        if index != -1:
                                sums[index] += element[1]
                        else:
                                letters.append(element[0])
                                sums.append(element[1])
                letter_sums = []
                for index in range(len(sums)):
                        letter_sums.append((letters[index], sums[index]))
                return letter_sums
               
        for index in range(len(alphabet)):
                compiledTotals.append((alphabet[index], sumLetters(resultTotals[index])))
       
       
        return compiledTotals
       
wordsToCiphers = makeCiphersFrom(hintHits)
fuzzyConfidenceCipher = compoundCiphers(wordsToCiphers)

def naiveDefuzzer(fuzzyCipher):
        def getBestLetter(letterLine):
                encodedLetter = ' '
                maxValue = 0
                for pair in letterLine[1]:
                        if pair[1] > maxValue:
                                encodedLetter = pair[0]
                return encodedLetter
        index = 0
        cipher = ''
        for letter in alphabet:
                cipher += getBestLetter(fuzzyCipher[index])
                index += 1
        return cipher

naiveCipher = naiveDefuzzer(fuzzyConfidenceCipher)
       
print ""
print "Partial Cypher #2:"
print "from: " + naiveCipher
print "  to: " + alphabet
print ""
cipher.seek(0)
printDeciphered(naiveCipher, alphabet, cipher.read().upper())
 


I hope you have better luck. Cry


The Improbable Island Enquirer - Forum
http://enquirer.improbableisland.com/forum/viewtopic.php?showtopic=25788