#!/usr/bin/env python

# txt2keywords.py - given a file, output a tab-delimited list of keywords

# configure
TOPN  = 0.005
MODEL = 'en_core_web_sm'

# require; note that textacy.ke ships with textacy versions prior to 0.12
import textacy.preprocessing
from textacy.ke.scake import scake
from textacy.ke.yake import yake
import spacy
import os
import sys

# sanity check
if len( sys.argv ) != 2 :
    sys.stderr.write( 'Usage: ' + sys.argv[ 0 ] + " <file>\n" )
    sys.exit( 1 )

# initialize
file = sys.argv[ 1 ]

# open the given file and unwrap it
with open( file ) as f : text = f.read()
text = textacy.preprocessing.normalize.normalize_quotation_marks( text )
text = textacy.preprocessing.normalize.normalize_hyphenated_words( text )
text = textacy.preprocessing.normalize.normalize_whitespace( text )

# compute the identifier; the given file's name, sans extension
id = os.path.basename( os.path.splitext( file )[ 0 ] )

# initialize model; raise the length limit so the whole text can be parsed
model            = spacy.load( MODEL )
model.max_length = len( text ) + 1
doc              = model( text )

# output a header
print( "id\tkeyword" )

# track found keywords to avoid duplicates
keywords = set()

# process and output each keyword with yake, which will produce unigrams
for keyword, score in yake( doc, topn=TOPN ) :
    if keyword not in keywords :
        print( "\t".join( [ id, keyword ] ) )
        keywords.add( keyword )

# process and output each keyword with scake, which will typically produce keyphrases;
# removing lemmatization with normalize=None seems to produce better results
for keyword, score in scake( doc, normalize=None, topn=TOPN ) :
    if keyword not in keywords :
        print( "\t".join( [ id, keyword ] ) )
        keywords.add( keyword )

# done
sys.exit()
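
# a sample invocation, assuming the en_core_web_sm model has been downloaded
# (python -m spacy download en_core_web_sm); the input file name walden.txt
# is only illustrative:
#
#   $ ./txt2keywords.py walden.txt > walden.tsv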