#! /usr/bin/env sage
#
# Use sage to pick up sagenb notebook library
# To fully test experimental pure Python, replace "sage" with "python"

################################################################################
#            Copyright 2010 Robert A. Beezer <beezer@ups.edu>
#
#  Distributed under the terms of the GNU General Public License (GPL),
#  version 2 or any later version.  The full text of the GPL is available at:
#
#                     http://www.gnu.org/licenses/
################################################################################

class TeXtoSWS(object):
    r"""
    Convert the XHTML output of tex4ht into Sage worksheets.

    An instance scans a directory for a single CSS file and for the
    HTML files that share its basename.  ``convert`` then builds
    either a single ``sws`` file or a ``tar.bz2`` archive holding
    several worksheets.

    NOTE: only the directory scan uses ``input_dir``.  The HTML, CSS
    and graphics files are later opened by names relative to the
    current working directory, so in practice the two should be the
    same directory.
    """

    def __init__(self, input_dir=None ):
        r"""
        Discover as much as possible about files that were
        output by tex4ht to a directory.

        INPUT:

        - ``input_dir`` - a directory that contains all of the
          output from a run of tex4ht on a latex file.  This
          directory should contain the associated graphics
          files but we locate them later in the HTML sources.
          Defaults to ``'./'`` when empty or ``None``.

        OUTPUT:

        Nothing is returned; these attributes are set:

        - ``_basename`` - derived from the single CSS file in the
          directory.  So if we begin with ``foo.tex`` all of the
          files involved will begin with ``foo``.

        - ``_files`` - a list of pairs.  The second part of each
          pair is the filename of an HTML file.  The first part is
          the (relative) worksheet number for that file, counting
          from zero, as a string (not an integer).  The pairs are
          sorted by the numerical value of this first string.

        - ``_input_dir`` - the directory that was scanned.

        - ``_likely_format`` - ``'sws'`` when exactly one HTML file
          was found, ``'tar'`` otherwise.

        Raises ``ValueError`` unless the directory holds exactly one
        CSS file.
        """
        from os import listdir  # to inspect directory
        import re               # to massage filenames

        if not input_dir:
            input_dir = './'
        directory = listdir(input_dir)

        # tex4ht builds HTML files and a CSS file
        # Infer basename of project from single CSS file in directory
        # Use this to find all HTML files
        cssfiles = [afile for afile in directory if afile.endswith('.css')]
        if len(cssfiles) != 1:
            raise ValueError('no CSS file, or multiple CSS files in directory')
        basename = cssfiles[0][:-4]

        # Find all html files
        # $ matches end-of-string, avoids backup-files with tildes (Robert Marik)
        # The basename is escaped so that characters such as '.' or '+'
        # in a project name are matched literally, and so is the dot
        # of the extension.
        htmlfile_pattern = re.compile(
            r'^' + re.escape(basename) + r'(li|)([0-9]*)(\.html)$')
        files = []
        for afile in directory:
            m = htmlfile_pattern.match(afile)
            if m:
                # Main HTML file does not get a number from tex4ht
                # Fits best as worksheet 0 when there are multiple files
                files.append((m.group(2) or '0', afile))
        files.sort(key=lambda f: int(f[0]))

        if len(files) == 1:
            self._likely_format = 'sws'
        else:
            # Need a new Sage container format here
            self._likely_format = 'tar'
        self._input_dir = input_dir
        self._basename = basename
        self._files = files


    def _worksheet_source(self, css_name, cells):
        r"""
        Assemble the text of a worksheet from parsed cells.

        INPUT:

        - ``css_name`` - filename of the CSS file, linked in at the
          top of the worksheet
        - ``cells`` - list of pairs ``('plain'|'compute', <contents>)``
          as returned by ``_parse_tex4ht``

        OUTPUT:

        The worksheet text encoded as ASCII, with any other
        character replaced by an XML character reference.  Compute
        cells are wrapped in ``{{{`` and ``}}}``.
        """
        content = [r'<link type="text/css" rel="stylesheet" href="' + css_name + r'" />']
        for kind, text in cells:
            if kind == 'plain':
                content.append(text)
            elif kind == 'compute':
                content.append('{{{' + text + '}}}')
        return ''.join(content).encode('ascii', 'xmlcharrefreplace')


    def _parse_tex4ht(self, html_name, linkbase=None):
        r"""
        Bust up tex4ht output into a title, graphics and cells.

        INPUT:

        - ``html_name`` - name of the XHTML file to read
        - ``linkbase`` - if given, links whose target looks like
          ``<linkbase>[li]<number>.html<rest>`` are rewritten in
          place as ``../<number>/<rest>`` (number ``0`` when absent)

        OUTPUT:

        A triple:

        - title - a string, the first text found in a ``title``
          element, or ``''`` if there is none
        - graphics - a list of filenames, from the ``data`` attribute
          of ``object`` elements and the ``src`` attribute of ``img``
          elements
        - cells - list of pairs ('plain'|'compute', <contents>)
          where contents are XHTML, or un-delimited Sage code
        """
        import xml.dom.minidom as dom
        import re     # regular expressions for parsing
        import os

        #  Using verbatim environments for Sage code
        #  allows some XML escape codes to slip through.
        #  <,> are two obvious ones and easy to handle.
        #  The XML escape character, &, is trickier.
        #  We only protect against breaking character
        #  codes like &#1234;  but not codes like &lt;.
        #
        #  Recognize when sage cells begin or end
        sage_start_pattern = re.compile( r'(.*)<sage>(.*)' )
        sage_end_pattern = re.compile( r'(.*)</sage>(.*)' )
        #  Ampersands that don't begin a character code
        ampersand_pattern = re.compile( r'(&(?!#[0-9]*;))' )

        sage_block = False
        xmlcontent = []
        with open(html_name, 'r') as html_file:
            for aline in html_file:
                if sage_block:
                    if sage_end_pattern.match(aline):
                        sage_block = False
                    else:
                        # Ampersands first, so the character codes
                        # introduced for < and > are left alone
                        aline = ampersand_pattern.sub('&#38;', aline)
                        aline = aline.replace('<', '&#60;')
                        aline = aline.replace('>', '&#62;')
                else:
                    opened = sage_start_pattern.match(aline)
                    if opened:
                        # A cell opened and closed on the same line
                        # must not leave us stuck "inside" the cell
                        sage_block = '</sage>' not in opened.group(2)
                xmlcontent.append(aline)

        # Can now parse valid XHTML
        tree = dom.parseString( ''.join(xmlcontent) )

        # Find a title (all of them really), the first one is used
        titles = []
        for e in tree.getElementsByTagName('title'):
            for text in e.childNodes:
                titles.append(text.data)
        if not titles:
            titles = ['']

        # Find SVG graphics from pgf/tikz placed by tex4ht
        graphics = []
        for e in tree.getElementsByTagName('object'):
            if e.hasAttribute('data'):
                graphics.append(e.getAttribute('data'))

        # Mirror above to grab "regular" graphicx \includegraphics
        # The file is recorded under its original path, while the
        # image tag is pointed at the bare filename
        for e in tree.getElementsByTagName('img'):
            if e.hasAttribute('src'):
                src = e.getAttribute('src')
                graphics.append(src)
                e.setAttribute('src', os.path.basename(src))

        # Find and modify links in place
        if linkbase:
            # linkbase is escaped so it is matched literally
            link_pattern = re.compile(
                r'^' + re.escape(linkbase) + r'(li|)([0-9]*)(\.html)(.*)$')
            for e in tree.getElementsByTagName('a'):
                if e.hasAttribute('href'):
                    m = link_pattern.match(e.getAttribute('href'))
                    if m:
                        # Handle '' as 0 worksheet
                        ws_number = m.group(2) or '0'
                        e.setAttribute('href', '../' + ws_number + '/' + m.group(4))

        # Ignore headers/footers by starting with body tag
        # Collect text between compute cells
        # Identify compute cells with <sage>,</sage> tag
        # as produced by custom configuration file for tex4ht
        thebody = tree.getElementsByTagName('body')[0]
        cells = []
        content = []
        for e in thebody.childNodes:
            # Only element children of the body are kept
            if e.nodeType != dom.Node.ELEMENT_NODE:
                continue
            tag = e.tagName
            if tag == 'sage':
                cells.append(('plain', ''.join(content)))
                content = []
                # Assume <sage>, </sage> block has just one child
                # AND text is 7-bit ASCII at this point
                # An empty block has no child at all
                code = e.firstChild
                if code is None:
                    cells.append(('compute', ''))
                else:
                    cells.append(('compute', code.data))
            elif tag not in ('script', 'noscript'):
                content.append(e.toxml())
        if content:
            cells.append(('plain', ''.join(content)))
        return titles[0], graphics, cells


    def _convert_one_file(self, html_name, css_name, nb, user, linkbase=None):
        r"""
        Create a single worksheet from a parsed tex4ht XHTML file.

        INPUT:

        - html_name - file name of HTML file
        - css_name - an associated CSS file
        - nb - a notebook to host worksheet creation temporarily
        - user - the user directory for worksheets in this notebook
        - linkbase - passed on to ``_parse_tex4ht`` to rewrite links
          between worksheets; ``None`` leaves links untouched

        OUTPUT:

        Returns a worksheet in ``nb``.
        """
        import shutil # file copy() to data directory

        title, graphics, cells = self._parse_tex4ht(html_name, linkbase)

        # Link in CSS file as part of HTML version
        # Add to filename list for data directory
        graphics.append(css_name)
        source = self._worksheet_source(css_name, cells)

        # Build a worksheet in nb, and return it
        #   Set title
        #   Place files in data directory
        #   Pack discovered graphics into data directory
        #   Data directory does not exist initially
        #   Side-effect of query is to build it
        W = nb.create_new_worksheet(title, user)
        datadir = W.data_directory()
        for filename in graphics:
            shutil.copy("./"+filename, datadir)
        W.edit_save(source)
        nb.save_worksheet(W)
        return W


    def _create_single_sws(self, basename):
        r"""
        Creates a single Sage worksheet in a portable sws format from a one-section LaTeX document.

        INPUT:

        - `basename` - a string. This is the basename of the original
          LaTeX input file and the basename of the tex4ht output.
          So, for example, suppose your original file is foo.tex, and
          when processed by tex4ht it produces an HTML/jsMath file called
          foo.html, and an associated CSS file foo.css.  You would provide
          `foo` as the input string, and would end up creating ``foo.sws``.
          So this routine will create a single worksheet faithfully representing
          the original intent in the LaTeX file and possibly including Sage
          compute cells. This assumes the necessary files are in the current
          working directory.

        OUTPUT:  This routine creates a file  foo.sws  in the current working directory.
        The return value is simply this filename as a string.
        """
        # We make a temporary notebook to work in
        # This is located in $HOME/.sage/temp/hostname/pid/
        # Temporary directory gets deleted automatically (as process ends?)
        from sage.misc.misc import tmp_dir
        from sagenb.notebook.notebook import Notebook
        nbdir = tmp_dir() + 'converter.sagenb'
        nb = Notebook(nbdir)
        sws_name = basename + '.sws'
        W = self._convert_one_file(basename+'.html', basename+'.css', nb, 'admin')
        nb.export_worksheet(W.filename(), sws_name)
        return sws_name

    def _pure_python(self, basename):
        r"""
        EXPERIMENTAL:
        Build an sws file without any notebook code.
        Assumes just a single file of HTML.
        Edit shebang to just call python, not sage

        INPUT:

        - ``basename`` - a string; ``<basename>.html`` and
          ``<basename>.css`` are read

        OUTPUT:

        Creates ``<basename>.sws``, a bzip2-compressed tar file, and
        returns that filename.  All members live below
        ``sage_worksheet/`` whatever the input directory is:
        ``worksheet_conf.pickle``, ``worksheet.html``,
        ``worksheet.txt`` and the files of ``data/``.
        """
        import io
        import tarfile
        import time  # for last change in pickled worksheet info
        try:
            import cPickle as pickle   # Python 2
        except ImportError:
            import pickle              # Python 3

        css_name = basename + '.css'
        html_name = basename + '.html'
        sws_name = basename + '.sws'

        # Break out tex4ht output
        title, graphics, cells = self._parse_tex4ht(html_name, None)

        # Piece back together in worksheet format
        graphics.append(css_name)
        body = self._worksheet_source(css_name, cells)

        # Make a generic worksheet configuration as a Python dictionary
        now = time.time()
        basic = {
            'name':title,
            'system':'sage',
            'owner':'admin',
            'last_change':('admin', now),
            }

        #  For older versions of notebook, backward compatible
        #  Just have two extra lines of info in header
        header = ''.join([title, '\n', 'system:', basic['system'], '\n'])
        header = header.encode('ascii', 'xmlcharrefreplace')

        # Members built in memory, so no temporary files are needed
        # Protocol 0 is the default of cPickle.dumps on Python 2
        prefix = 'sage_worksheet/'
        members = [
            (prefix + 'worksheet_conf.pickle', pickle.dumps(basic, 0)),
            (prefix + 'worksheet.html', body),
            (prefix + 'worksheet.txt', header + body),
            ]

        # Build sws as a tar file, with expected name
        T = tarfile.open(sws_name, 'w:bz2')
        try:
            for arcname, payload in members:
                info = tarfile.TarInfo(arcname)
                info.size = len(payload)
                info.mtime = int(now)
                T.addfile(info, io.BytesIO(payload))

            # Data files, graphics, css, whatever
            dataprefix = prefix + 'data/'
            for f in graphics:
                T.add(f, dataprefix + f)
        finally:
            # Close the archive even if a data file is missing
            T.close()
        return sws_name


    def _create_tar_archive(self, basename):
        r"""
        Convert every discovered HTML file and bundle the results.

        INPUT:

        - ``basename`` - a string, the common basename of the tex4ht
          output; also used as the base for rewriting links

        OUTPUT:

        Creates ``<basename>.tar.bz2`` in the current working
        directory.  It holds the ``linear`` user directory of the
        notebook kept at ``/tmp/fcla.sagenb``.  Nothing is returned.
        """
        # this is all ad-hoc for testing
        # long-term the notebook might be temporary, or not
        # One approach would be a portable container, like sws
        # Other would be to install directly in a user's notebook
        import os
        import tarfile
        from sagenb.notebook.notebook import Notebook
        nbdir = "/tmp/fcla.sagenb"
        nb = Notebook(nbdir)
        nb.add_user('linear', 'algebra', 'none@nobody.com', account_type='user', force=True)
        cssfilename = basename + '.css'
        for _, htmlfilename in self._files:
            print("Converting:  " + htmlfilename)
            self._convert_one_file(htmlfilename, cssfilename, nb, 'linear', basename)
        nb.save() # for good measure
        # Bundle up as an archive
        # Get pathnames right for ez decompression as user
        # An explicit archive name does this without changing
        # the working directory of the process
        print("Forming tar archive...")
        T = tarfile.open(basename+'.tar.bz2', 'w:bz2')
        try:
            T.add(os.path.join(nbdir, 'home', 'linear'), arcname='linear')
        finally:
            T.close()

    def convert(self, dir = None, format = None):
        r"""
        The one public method.

        INPUT:

        - ``dir`` - currently unused; kept so existing calls work
        - ``format`` - one of ``'sws'``, ``'tar'``, or the testing
          formats ``'xml-test'`` and ``'pure-python'``.  Defaults to
          the format guessed when the directory was scanned.

        OUTPUT:

        Nothing is returned.  Files are created by the routine that
        handles the chosen format; the testing formats print their
        result.  Raises ``ValueError`` for any other format.
        """
        if not format:
            format = self._likely_format
        # pass a directory to _create_single_sws?
        if format == 'sws':
            self._create_single_sws(self._basename)
        elif format == 'tar':
            self._create_tar_archive(self._basename)
        ## Calls to testing routines, not permanent
        elif format == 'xml-test':
            print(self._parse_tex4ht(self._basename+'.html', self._basename))
        elif format == 'pure-python':
            print(self._pure_python(self._basename))
        else:
            raise ValueError('unknown conversion format: %r' % (format,))

############################
# Main
############################
#
# Create converter class
# Call convert()

# Scan the default directory, then convert using the format that
# was inferred from the files found there.
t2s = TeXtoSWS(input_dir=None)
t2s.convert(dir=None, format=None)
## Testing, experimental calls
## t2s.convert(format = 'pure-python')
## t2s.convert(format='xml-test')