#!/usr/bin/env python

# txt2keywords.py - given a file, output a tab-delimited list of keywords

# configure
TOPN  = 0.005
MODEL = 'en_core_web_sm'

# require; note that textacy.ke ships with textacy versions prior to 0.12
import textacy.preprocessing
from textacy.ke.scake import scake
from textacy.ke.yake import yake
import spacy
import os
import sys

# sanity check
if len( sys.argv ) != 2 :
    sys.stderr.write( 'Usage: ' + sys.argv[ 0 ] + " <file>\n" )
    sys.exit( 1 )

# initialize
file = sys.argv[ 1 ]

# open the given file and unwrap it
with open( file ) as f : text = f.read()
text = textacy.preprocessing.normalize.normalize_quotation_marks( text )
text = textacy.preprocessing.normalize.normalize_hyphenated_words( text )
text = textacy.preprocessing.normalize.normalize_whitespace( text )

# compute the identifier; the given file's name, sans extension
id = os.path.basename( os.path.splitext( file )[ 0 ] )

# initialize model; raise the length limit so the whole text can be parsed
model            = spacy.load( MODEL )
model.max_length = len( text ) + 1
doc              = model( text )

# output a header
print( "id\tkeyword" )

# track found keywords to avoid duplicates
keywords = set()

# process and output each keyword with yake, which will produce unigrams
for keyword, score in yake( doc, topn=TOPN ) :
    if keyword not in keywords :
        print( "\t".join( [ id, keyword ] ) )
        keywords.add( keyword )

# process and output each keyword with scake, which will typically produce keyphrases;
# removing lemmatization with normalize=None seems to produce better results
for keyword, score in scake( doc, normalize=None, topn=TOPN ) :
    if keyword not in keywords :
        print( "\t".join( [ id, keyword ] ) )
        keywords.add( keyword )

# done
sys.exit()
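
# a sample invocation, assuming the en_core_web_sm model has been downloaded
# (python -m spacy download en_core_web_sm); the input file name walden.txt
# is only illustrative:
#
#   $ ./txt2keywords.py walden.txt > walden.tsv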