
import nltk
import string
import sys
import os
from math import log
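
# NLTK's tokenizer and stopword list need a one-time setup before first use,
# e.g. nltk.download("punkt") and nltk.download("stopwords").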

FILE_MATCHES = 1
SENTENCE_MATCHES = 1


def main():

    # Check command-line arguments
    if len(sys.argv) != 2:
        sys.exit("Usage: python questions.py corpus")

    # Calculate IDF values across files
    files = load_files(sys.argv[1])
    file_words = {
        filename: tokenize(files[filename])
        for filename in files
    }
    file_idfs = compute_idfs(file_words)

    # Prompt user for query
    query = set(tokenize(input("Query: ")))

    # Determine top file matches according to TF-IDF
    filenames = top_files(query, file_words, file_idfs, n=FILE_MATCHES)

    # Extract sentences from top files
    sentences = dict()
    for filename in filenames:
        for passage in files[filename].split("\n"):
            for sentence in nltk.sent_tokenize(passage):
                tokens = tokenize(sentence)
                if tokens:
                    sentences[sentence] = tokens

    # Compute IDF values across sentences
    idfs = compute_idfs(sentences)

    # Determine top sentence matches
    matches = top_sentences(query, sentences, idfs, n=SENTENCE_MATCHES)
    for match in matches:
        print(match)
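
# An example session (a sketch only; the answer depends entirely on the
# corpus passed on the command line):
#   $ python questions.py corpus
#   Query: <your question here>
#   <the best-matching sentence from the corpus is printed>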


def load_files(directory):
    """
    Given a directory name, return a dictionary mapping the filename of each
    `.txt` file inside that directory to the file's contents as a string.
    """

    filenames = dict()

    for file in os.listdir(directory):
        # Read only the `.txt` files the docstring promises
        if file.endswith(".txt"):
            with open(os.path.join(directory, file), encoding="utf-8") as content:
                filenames[file] = content.read()

    return filenames
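
# For example, for a directory holding a.txt and b.txt (hypothetical names),
# load_files returns {"a.txt": "<contents>", "b.txt": "<contents>"}.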


def tokenize(document):
    """
    Given a document (represented as a string), return a list of all of the
    words in that document, in order.
    Process the document by converting all words to lowercase and removing
    any punctuation or English stopwords.
    """

    # Build the punctuation and stopword sets once, not once per token
    punctuation = set(string.punctuation) | {"``", "''"}
    stopwords = set(nltk.corpus.stopwords.words("english"))

    return [
        token for token in nltk.word_tokenize(document.lower())
        if token not in punctuation and token not in stopwords
    ]
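
# A sketch of the behaviour (exact tokens depend on NLTK's tokenizer and
# stopword list):
#   tokenize("The cat, the hat.")  ->  ["cat", "hat"]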


def compute_idfs(documents):
    """
    Given a dictionary of `documents` that maps names of documents to a list
    of words, return a dictionary that maps words to their IDF values.
    Any word that appears in at least one of the documents should be in the
    resulting dictionary.
    """

    idf = {}
    count = {}

    # Count how many documents each word appears in (at most once per doc)
    for document in documents:
        for word in set(documents[document]):
            count[word] = count.get(word, 0) + 1

    # IDF is the natural log of (total documents / documents containing word)
    for word in count:
        idf[word] = log(len(documents) / count[word])

    return idf
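
# A worked example: with 2 documents where "cat" appears in only one,
# idf["cat"] = log(2 / 1) ≈ 0.693, while a word found in every document
# gets log(2 / 2) = 0 and so contributes nothing to a match score.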
 

def top_files(query, files, idfs, n):
    """
    Given a `query` (a set of words), `files` (a dictionary mapping names of
    files to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the filenames of the `n` top
    files that match the query, ranked according to tf-idf.
    """

    result = {}

    for file, content in files.items():
        value = 0

        # tf-idf: how often the word occurs in this file, times its IDF
        for word in query:
            if word in content:
                value += content.count(word) * idfs[word]

        if value != 0:
            result[file] = value

    # Rank files by tf-idf score, highest first, and keep the top n
    return [x[0]
            for x in sorted(result.items(), key=lambda x: x[1], reverse=True)][:n]
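
# A minimal sketch with hypothetical data:
#   words = {"a.txt": ["cat", "cat", "dog"], "b.txt": ["dog"]}
#   idfs = compute_idfs(words)            # cat: log(2) ≈ 0.693, dog: 0.0
#   top_files({"cat"}, words, idfs, n=1)  # -> ["a.txt"]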


def top_sentences(query, sentences, idfs, n):
    """
    Given a `query` (a set of words), `sentences` (a dictionary mapping
    sentences to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the `n` top sentences that match
    the query, ranked according to idf. If there are ties, preference should
    be given to sentences that have a higher query term density.
    """

    output = []

    for sentence in sentences:
        # sentence, idf and density
        scores = [sentence, 0, 0]

        for word in query:
            if word in sentences[sentence]:
                # summed IDF of every query word present in the sentence
                scores[1] += idfs[word]

                # density: this query word's share of the sentence's words
                scores[2] += (sentences[sentence].count(word)
                              / len(sentences[sentence]))

        output.append(scores)

    output.sort(key=lambda x: (x[1], x[2]), reverse=True)

    # Slicing avoids an IndexError when fewer than n sentences exist
    return [entry[0] for entry in output[:n]]
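
# Density only matters as a tie-breaker here: Python compares the
# (idf, density) tuples element by element, so the density is consulted
# only when the summed IDF scores are equal.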


if __name__ == "__main__":
    main()