use most accurate language from modeline lexer and filename lexer Pygments guessing functions

This commit is contained in:
Alan Hamlett 2015-06-11 14:11:10 -07:00
parent 127fd115d0
commit 36f6bb9a21

View file

@ -20,7 +20,7 @@ if sys.version_info[0] == 2:
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'packages', 'pygments_py2')) sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'packages', 'pygments_py2'))
else: else:
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'packages', 'pygments_py3')) sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'packages', 'pygments_py3'))
from pygments.lexers import guess_lexer_for_filename from pygments.lexers import guess_lexer, guess_lexer_for_filename
log = logging.getLogger('WakaTime') log = logging.getLogger('WakaTime')
@ -47,20 +47,67 @@ TRANSLATIONS = {
def guess_language(file_name):
    """Guess lexer and language for a file.

    The hard-coded extension map takes priority for the language name; the
    Pygments lexer name is only used when no extension rule matched.

    Returns a ``(language, lexer)`` tuple. ``language`` is a unicode string
    or None; ``lexer`` is whatever ``smart_guess_lexer`` returned (possibly
    None).
    """
    lexer = smart_guess_lexer(file_name)

    language = None

    # guess language from file extension
    if file_name:
        # NOTE(review): for a name without a dot, rsplit yields the whole
        # name as the "extension" -- confirm the extension map expects that.
        language = guess_language_from_extension(file_name.rsplit('.', 1)[-1])

    # get language from lexer if we didn't have a hard-coded extension rule
    if language is None and lexer:
        language = translate_language(u(lexer.name))

    return language, lexer
def _lexer_accuracy(lexer, text):
    """Return ``lexer.analyse_text(text)``, or None if it cannot be scored."""
    if lexer is None:
        return None
    try:
        return lexer.analyse_text(text)
    except Exception:
        return None


def smart_guess_lexer(file_name):
    """Guess Pygments lexer for a file.

    Two candidates are produced from the head of the file:

    1. ``guess_lexer(text)``: a guess from the contents alone (according to
       the original author this is what picks up a vim modeline).
    2. ``guess_lexer_for_filename(file_name, text)``: lexers matching the
       file name, narrowed down by text analysis.

    Each candidate is then scored with its own ``analyse_text`` and the one
    with the higher, non-zero score wins.

    Returns the chosen lexer, or None when the file cannot be read or when
    neither candidate has a truthy score.
    """
    text = get_file_contents(file_name)
    if text is None:
        # Unreadable file: there is nothing to analyse or score.
        return None

    # Pygments signals "no match" by raising, so a failed guess is treated
    # as "no candidate". Exception (not a bare except) keeps
    # KeyboardInterrupt/SystemExit propagating.
    try:
        guess_1 = guess_lexer(text)
    except Exception:
        guess_1 = None

    try:
        guess_2 = guess_lexer_for_filename(file_name, text)
    except Exception:
        guess_2 = None

    accuracy_1 = _lexer_accuracy(guess_1, text)
    accuracy_2 = _lexer_accuracy(guess_2, text)

    # NOTE(review): a candidate whose score is 0 is discarded even if it is
    # the only one available -- confirm that dropping a filename-only match
    # is intended.
    lexer = guess_1 if accuracy_1 else None
    if accuracy_2 and (not accuracy_1 or accuracy_2 > accuracy_1):
        lexer = guess_2

    return lexer
def guess_language_from_extension(extension): def guess_language_from_extension(extension):
"""Checks hard-coded extension map for a matching language.
"""
if extension: if extension:
if extension in EXTENSIONS: if extension in EXTENSIONS:
return EXTENSIONS[extension] return EXTENSIONS[extension]
@ -70,6 +117,9 @@ def guess_language_from_extension(extension):
def translate_language(language):
    """Turn a Pygments lexer name string into a human-readable language.

    Names listed in ``TRANSLATIONS`` are replaced by their mapped value; any
    other value is returned unchanged.
    """
    return TRANSLATIONS.get(language, language)
@ -107,3 +157,16 @@ def get_file_stats(file_name, notfile=False, lineno=None, cursorpos=None):
'cursorpos': cursorpos, 'cursorpos': cursorpos,
} }
return stats return stats
def get_file_contents(file_name):
    """Return the beginning of a file's text, or None if it cannot be read.

    The file is opened as UTF-8 text and at most 512000 units are read with
    a single ``read`` call (for a Python 3 text-mode file these are
    characters, not bytes).

    This is a best-effort helper: any failure to open, read or decode the
    file results in None rather than an exception.
    """
    # NOTE(review): the ``encoding`` keyword is only accepted by Python 3's
    # builtin open; presumably a compatible ``open`` is in scope on Python 2
    # -- confirm against the module's imports.
    try:
        with open(file_name, 'r', encoding='utf-8') as fh:
            return fh.read(512000)
    except Exception:
        # Deliberately swallowed (missing file, permissions, undecodable
        # bytes, bad argument type). Exception rather than a bare except so
        # KeyboardInterrupt/SystemExit still propagate.
        return None