rana-cli/wakatime/packages/pygments/lexers/textfmts.py
2017-02-13 23:25:51 -08:00

297 lines
11 KiB
Python

# -*- coding: utf-8 -*-
"""
pygments.lexers.textfmts
~~~~~~~~~~~~~~~~~~~~~~~~
Lexers for various text formats.
:copyright: Copyright 2006-2017 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import re
from pygments.lexer import RegexLexer, bygroups
from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
Number, Generic, Literal
from pygments.util import ClassNotFound
__all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer']
class IrcLogsLexer(RegexLexer):
"""
Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.
"""
name = 'IRC logs'
aliases = ['irc']
filenames = ['*.weechatlog']
mimetypes = ['text/x-irclog']
flags = re.VERBOSE | re.MULTILINE
timestamp = r"""
(
# irssi / xchat and others
(?: \[|\()? # Opening bracket or paren for the timestamp
(?: # Timestamp
(?: (?:\d{1,4} [-/])* # Date as - or /-separated groups of digits
(?:\d{1,4})
[T ])? # Date/time separator: T or space
(?: \d?\d [:.])* # Time as :/.-separated groups of 1 or 2 digits
(?: \d?\d)
)
(?: \]|\))?\s+ # Closing bracket or paren for the timestamp
|
# weechat
\d{4}\s\w{3}\s\d{2}\s # Date
\d{2}:\d{2}:\d{2}\s+ # Time + Whitespace
|
# xchat
\w{3}\s\d{2}\s # Date
\d{2}:\d{2}:\d{2}\s+ # Time + Whitespace
)?
"""
tokens = {
'root': [
# log start/end
(r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
# hack
("^" + timestamp + r'(\s*<[^>]*>\s*)$', bygroups(Comment.Preproc, Name.Tag)),
# normal msgs
("^" + timestamp + r"""
(\s*<.*?>\s*) # Nick """,
bygroups(Comment.Preproc, Name.Tag), 'msg'),
# /me msgs
("^" + timestamp + r"""
(\s*[*]\s+) # Star
(\S+\s+.*?\n) # Nick + rest of message """,
bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
# join/part msgs
("^" + timestamp + r"""
(\s*(?:\*{3}|<?-[!@=P]?->?)\s*) # Star(s) or symbols
(\S+\s+) # Nick + Space
(.*?\n) # Rest of message """,
bygroups(Comment.Preproc, Keyword, String, Comment)),
(r"^.*?\n", Text),
],
'msg': [
(r"\S+:(?!//)", Name.Attribute), # Prefix
(r".*\n", Text, '#pop'),
],
}
class GettextLexer(RegexLexer):
"""
Lexer for Gettext catalog files.
.. versionadded:: 0.9
"""
name = 'Gettext Catalog'
aliases = ['pot', 'po']
filenames = ['*.pot', '*.po']
mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']
tokens = {
'root': [
(r'^#,\s.*?$', Keyword.Type),
(r'^#:\s.*?$', Keyword.Declaration),
# (r'^#$', Comment),
(r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
(r'^(")([A-Za-z-]+:)(.*")$',
bygroups(String, Name.Property, String)),
(r'^".*"$', String),
(r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
bygroups(Name.Variable, Text, String)),
(r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
bygroups(Name.Variable, Number.Integer, Name.Variable, Text, String)),
]
}
class HttpLexer(RegexLexer):
"""
Lexer for HTTP sessions.
.. versionadded:: 1.5
"""
name = 'HTTP'
aliases = ['http']
flags = re.DOTALL
def get_tokens_unprocessed(self, text, stack=('root',)):
"""Reset the content-type state."""
self.content_type = None
return RegexLexer.get_tokens_unprocessed(self, text, stack)
def header_callback(self, match):
if match.group(1).lower() == 'content-type':
content_type = match.group(5).strip()
if ';' in content_type:
content_type = content_type[:content_type.find(';')].strip()
self.content_type = content_type
yield match.start(1), Name.Attribute, match.group(1)
yield match.start(2), Text, match.group(2)
yield match.start(3), Operator, match.group(3)
yield match.start(4), Text, match.group(4)
yield match.start(5), Literal, match.group(5)
yield match.start(6), Text, match.group(6)
def continuous_header_callback(self, match):
yield match.start(1), Text, match.group(1)
yield match.start(2), Literal, match.group(2)
yield match.start(3), Text, match.group(3)
def content_callback(self, match):
content_type = getattr(self, 'content_type', None)
content = match.group()
offset = match.start()
if content_type:
from pygments.lexers import get_lexer_for_mimetype
possible_lexer_mimetypes = [content_type]
if '+' in content_type:
# application/calendar+xml can be treated as application/xml
# if there's not a better match.
general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
content_type)
possible_lexer_mimetypes.append(general_type)
for i in possible_lexer_mimetypes:
try:
lexer = get_lexer_for_mimetype(i)
except ClassNotFound:
pass
else:
for idx, token, value in lexer.get_tokens_unprocessed(content):
yield offset + idx, token, value
return
yield offset, Text, content
tokens = {
'root': [
(r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|TRACE|PATCH)( +)([^ ]+)( +)'
r'(HTTP)(/)(1\.[01])(\r?\n|\Z)',
bygroups(Name.Function, Text, Name.Namespace, Text,
Keyword.Reserved, Operator, Number, Text),
'headers'),
(r'(HTTP)(/)(1\.[01])( +)(\d{3})( +)([^\r\n]+)(\r?\n|\Z)',
bygroups(Keyword.Reserved, Operator, Number, Text, Number,
Text, Name.Exception, Text),
'headers'),
],
'headers': [
(r'([^\s:]+)( *)(:)( *)([^\r\n]+)(\r?\n|\Z)', header_callback),
(r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
(r'\r?\n', Text, 'content')
],
'content': [
(r'.+', content_callback)
]
}
def analyse_text(text):
return text.startswith(('GET /', 'POST /', 'PUT /', 'DELETE /', 'HEAD /',
'OPTIONS /', 'TRACE /', 'PATCH /'))
class TodotxtLexer(RegexLexer):
"""
Lexer for `Todo.txt <http://todotxt.com/>`_ todo list format.
.. versionadded:: 2.0
"""
name = 'Todotxt'
aliases = ['todotxt']
# *.todotxt is not a standard extension for Todo.txt files; including it
# makes testing easier, and also makes autodetecting file type easier.
filenames = ['todo.txt', '*.todotxt']
mimetypes = ['text/x-todo']
# Aliases mapping standard token types of Todo.txt format concepts
CompleteTaskText = Operator # Chosen to de-emphasize complete tasks
IncompleteTaskText = Text # Incomplete tasks should look like plain text
# Priority should have most emphasis to indicate importance of tasks
Priority = Generic.Heading
# Dates should have next most emphasis because time is important
Date = Generic.Subheading
# Project and context should have equal weight, and be in different colors
Project = Generic.Error
Context = String
# If tag functionality is added, it should have the same weight as Project
# and Context, and a different color. Generic.Traceback would work well.
# Regex patterns for building up rules; dates, priorities, projects, and
# contexts are all atomic
# TODO: Make date regex more ISO 8601 compliant
date_regex = r'\d{4,}-\d{2}-\d{2}'
priority_regex = r'\([A-Z]\)'
project_regex = r'\+\S+'
context_regex = r'@\S+'
# Compound regex expressions
complete_one_date_regex = r'(x )(' + date_regex + r')'
complete_two_date_regex = (complete_one_date_regex + r'( )(' +
date_regex + r')')
priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'
tokens = {
# Should parse starting at beginning of line; each line is a task
'root': [
# Complete task entry points: two total:
# 1. Complete task with two dates
(complete_two_date_regex, bygroups(CompleteTaskText, Date,
CompleteTaskText, Date),
'complete'),
# 2. Complete task with one date
(complete_one_date_regex, bygroups(CompleteTaskText, Date),
'complete'),
# Incomplete task entry points: six total:
# 1. Priority plus date
(priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
'incomplete'),
# 2. Priority only
(priority_regex, Priority, 'incomplete'),
# 3. Leading date
(date_regex, Date, 'incomplete'),
# 4. Leading context
(context_regex, Context, 'incomplete'),
# 5. Leading project
(project_regex, Project, 'incomplete'),
# 6. Non-whitespace catch-all
('\S+', IncompleteTaskText, 'incomplete'),
],
# Parse a complete task
'complete': [
# Newline indicates end of task, should return to root
(r'\s*\n', CompleteTaskText, '#pop'),
# Tokenize contexts and projects
(context_regex, Context),
(project_regex, Project),
# Tokenize non-whitespace text
('\S+', CompleteTaskText),
# Tokenize whitespace not containing a newline
('\s+', CompleteTaskText),
],
# Parse an incomplete task
'incomplete': [
# Newline indicates end of task, should return to root
(r'\s*\n', IncompleteTaskText, '#pop'),
# Tokenize contexts and projects
(context_regex, Context),
(project_regex, Project),
# Tokenize non-whitespace text
('\S+', IncompleteTaskText),
# Tokenize whitespace not containing a newline
('\s+', IncompleteTaskText),
],
}