ability to prioritize popular languages over uncommon languages

This commit is contained in:
Alan Hamlett 2017-02-26 17:05:55 -08:00
parent 9ce9d528fd
commit 8c2685696c
4 changed files with 74 additions and 5 deletions

View file

@ -0,0 +1 @@

View file

@ -190,7 +190,7 @@ class LanguagesTestCase(utils.TestCase):
language = None
self.assertEqual(self.patched['wakatime.offlinequeue.Queue.push'].call_args[0][0].get('language'), language)
def test_typescript_detected_correctly(self):
def test_typescript_detected_over_typoscript(self):
response = Response()
response.status_code = 500
self.patched['wakatime.packages.requests.adapters.HTTPAdapter.send'].return_value = response
@ -205,3 +205,19 @@ class LanguagesTestCase(utils.TestCase):
language = u('TypeScript')
self.assertEqual(self.patched['wakatime.offlinequeue.Queue.push'].call_args[0][0].get('language'), language)
def test_perl_detected_over_prolog(self):
    # Runs the CLI against a ``.pl`` sample and checks that the language
    # recorded for it is Perl (the test name contrasts this with Prolog).

    # Make the patched HTTP adapter answer with a 500 status.
    response = Response()
    response.status_code = 500
    self.patched['wakatime.packages.requests.adapters.HTTPAdapter.send'].return_value = response

    now = u(int(time.time()))
    config = 'tests/samples/configs/good_config.cfg'
    entity = 'tests/samples/codefiles/perl.pl'
    args = ['--file', entity, '--config', config, '--time', now]

    retval = execute(args)
    # ``assertEqual`` instead of the deprecated ``assertEquals`` alias, which
    # no longer exists on Python 3.12+; this also matches the assertion below.
    self.assertEqual(retval, 102)

    # The detected language is read from the first positional argument of
    # the patched ``Queue.push`` call.
    language = u('Perl')
    self.assertEqual(self.patched['wakatime.offlinequeue.Queue.push'].call_args[0][0].get('language'), language)

View file

@ -450,7 +450,6 @@ class TypeScriptLexer(RegexLexer):
aliases = ['ts', 'typescript']
filenames = ['*.ts', '*.tsx']
mimetypes = ['text/x-typescript']
priority = 0.11
flags = re.DOTALL | re.MULTILINE

View file

@ -18,10 +18,12 @@ from .compat import u, open
from .dependencies import DependencyParser
from .packages.pygments.lexers import (
_iter_lexerclasses,
_fn_matches,
basename,
ClassNotFound,
find_lexer_class,
get_lexer_by_name,
guess_lexer_for_filename,
)
from .packages.pygments.modeline import get_filetype_from_buffer
@ -99,7 +101,7 @@ def smart_guess_lexer(file_name):
lexer = lexer1
if (lexer2 and accuracy2 and
(not accuracy1 or accuracy2 > accuracy1)):
lexer = lexer2 # pragma: nocover
lexer = lexer2
return lexer
@ -113,7 +115,7 @@ def guess_lexer_using_filename(file_name, text):
lexer, accuracy = None, None
try:
lexer = guess_lexer_for_filename(file_name, text)
lexer = custom_pygments_guess_lexer_for_filename(file_name, text)
except:
pass
@ -263,3 +265,54 @@ def get_file_head(file_name):
except:
log.traceback(logging.DEBUG)
return text
def custom_pygments_guess_lexer_for_filename(_fn, _text, **options):
    """Choose a lexer for a file name, ranking candidates with custom priorities.

    Stands in for ``pygments.lexers.guess_lexer_for_filename`` so that
    priorities adjusted by ``customize_priority`` take part in the ranking.

    :param _fn: file name or path; only its base name is matched.
    :param _text: text handed to each candidate's ``analyse_text``.
    :param options: keyword arguments forwarded when the chosen lexer
        class is called.
    :returns: the result of calling the winning lexer class with ``options``.
    :raises ClassNotFound: when no lexer pattern matches the file name.
    """
    fn = basename(_fn)

    # is_primary[cls] is True when cls matched through ``filenames`` and False
    # when it matched through ``alias_filenames``. The alias patterns are
    # scanned second, so a class matching both ends up flagged False.
    is_primary = {}
    candidates = set()
    for lexer_cls in _iter_lexerclasses():
        pattern_groups = (
            (lexer_cls.filenames, True),
            (lexer_cls.alias_filenames, False),
        )
        for patterns, primary_flag in pattern_groups:
            for pattern in patterns:
                if _fn_matches(fn, pattern):
                    candidates.add(lexer_cls)
                    is_primary[lexer_cls] = primary_flag

    if not candidates:
        raise ClassNotFound('no lexer for filename %r found' % fn)

    # A single candidate needs no scoring.
    if len(candidates) == 1:
        return candidates.pop()(**options)

    scored = []
    for lexer_cls in candidates:
        score = lexer_cls.analyse_text(_text)
        # A score of exactly 1.0 wins immediately, before any ranking.
        if score == 1.0:
            return lexer_cls(**options)
        scored.append((score, customize_priority(lexer_cls)))

    def rank(entry):
        # Ordering, most significant first: analyse score, whether the match
        # came from a primary filename pattern, priority, then class name as
        # the last resort.
        score, cls = entry
        return (score, is_primary[cls], cls.priority, cls.__name__)

    scored.sort(key=rank)

    # The highest-ranked candidate sits at the end after the ascending sort.
    best = scored[-1][1]
    return best(**options)
# Priority overrides, keyed by a lexer's lower-cased ``name`` attribute.
# ``customize_priority`` writes the matching value onto the lexer's
# ``priority`` attribute.
CUSTOM_PRIORITIES = {
    'perl': 0.1,
    'perl6': 0.1,
    'typescript': 0.11,
}


def customize_priority(lexer):
    """Apply any custom priority to the given lexer and return the lexer.

    If the lexer's lower-cased ``name`` is a key of ``CUSTOM_PRIORITIES``,
    its ``priority`` attribute is overwritten in place with the configured
    value. Otherwise the lexer is left untouched.

    :param lexer: object exposing ``name`` and ``priority`` attributes.
    :returns: the same ``lexer`` object that was passed in (not a number).
    """
    # Single lookup instead of a membership test followed by indexing.
    custom = CUSTOM_PRIORITIES.get(lexer.name.lower())
    if custom is not None:
        lexer.priority = custom
    return lexer