#! /usr/bin/env sage
#
# Use sage to pick up sagenb notebook library
# To fully test experimental pure Python, replace "sage" with "python"

################################################################################
#       Copyright 2010 Robert A. Beezer
#
#  Distributed under the terms of the GNU General Public License (GPL),
#  version 2 or any later version.  The full text of the GPL is available at:
#
#                  http://www.gnu.org/licenses/
################################################################################

import io
import os
import re
import shutil
import tarfile
import time
import xml.dom.minidom as dom

try:
    import cPickle as pickle    # Python 2
except ImportError:
    import pickle               # Python 3

# Markers that delimit Sage code in the tex4ht output.  The element name
# matches the 'sage' tag that _parse_tex4ht() looks for in the body.
_SAGE_START = b'<sage>'
_SAGE_END = b'</sage>'

# Ampersands that don't begin a numeric character code
_BARE_AMPERSAND = re.compile(br'&(?!#[0-9]*;)')


def _tex4ht_html_pattern(basename, tail=r''):
    r"""
    Compile a regular expression for the HTML file names of one project.

    INPUT:

    - ``basename`` - the project basename; it is matched literally.

    - ``tail`` - an optional regular expression fragment allowed after
      the ``.html`` extension (used for the remainder of a URL).

    OUTPUT:

    A compiled pattern.  Group 2 is the (possibly empty) string of digits
    that follows the basename and an optional ``li``; group 4, present
    only when ``tail`` supplies it, is whatever follows ``.html``.
    """
    # $ matches end-of-string, avoids backup-files with tildes (Robert Marik)
    # re.escape() keeps characters such as '+' or '.' in the basename from
    # being interpreted as regular expression syntax.
    return re.compile(r'^' + re.escape(basename)
                      + r'(li|)([0-9]*)(\.html)' + tail + r'$')


class TeXtoSWS(object):
    r"""
    Convert the output of tex4ht into Sage worksheets.
    """

    def __init__(self, input_dir=None):
        r"""
        Discover as much as possible about files that were output
        by tex4ht to a directory.

        INPUT:

        - ``input_dir`` - a directory that contains all of the output
          from a run of tex4ht on a latex file.  This directory should
          contain the associated graphics files but we locate them later
          in the HTML sources.  Defaults to ``'./'``.

        OUTPUT:

        Several items are set here.  One is a ``basename`` which tex4ht
        will have derived from the original LaTeX source file.  So if we
        begin with ``foo.tex`` all of the files involved will begin
        with ``foo``.

        Another item set is a list of pairs.  The second part of each
        pair is the filename for an HTML file.  The first part of the
        pair is the (relative) worksheet number for that file, with
        counting starting at zero, and as a string (not an integer).
        The pairs are sorted according to the numerical value of this
        first string.

        The directory where all these files live is recorded
        as ``self._input_dir``.

        Based on the number of HTML files discovered, a
        ``_likely_format`` is set.

        A ``ValueError`` is raised unless the directory holds exactly
        one CSS file.
        """
        if not input_dir:
            input_dir = './'
        directory = os.listdir(input_dir)

        # tex4ht builds HTML files and a CSS file
        # Infer basename of project from single CSS file in directory
        # Use this to find all HTML files
        cssfiles = [afile for afile in directory if afile.endswith('.css')]
        if len(cssfiles) != 1:
            raise ValueError('no CSS file, or multiple CSS files in directory')
        basename = cssfiles[0][:-4]

        # Find all html files
        htmlfile_pattern = _tex4ht_html_pattern(basename)
        files = []
        for afile in directory:
            m = htmlfile_pattern.match(afile)
            if m:
                # Main HTML file does not get a number from tex4ht
                # Fits best as worksheet 0 when there are multiple files
                files.append((m.group(2) or '0', afile))
        files.sort(key=lambda f: int(f[0]))

        if len(files) == 1:
            self._likely_format = 'sws'
        else:
            # Need a new Sage container format here
            self._likely_format = 'tar'
        self._input_dir = input_dir
        self._basename = basename
        self._files = files

    @staticmethod
    def _escape_sage_blocks(html_name):
        r"""
        Read a tex4ht file and make the Sage code in it safe for XML.

        Using verbatim environments for Sage code allows some XML
        special characters to slip through.  ``<`` and ``>`` are two
        obvious ones and easy to handle.  The XML escape character,
        ``&``, is trickier: numeric character codes such as ``&#1234;``
        are left alone, but named codes such as ``&lt;`` are not
        recognized and have their ampersand escaped.

        Only lines strictly between the line holding the opening
        marker and the line holding the closing marker are escaped.

        INPUT:

        - ``html_name`` - file name of the tex4ht output.

        OUTPUT:

        The contents of the file, as bytes, with the escapes applied.
        The file is read as bytes so the XML parser can honor whatever
        encoding the document declares.
        """
        escaped = []
        in_sage = False
        with open(html_name, 'rb') as html_file:
            for aline in html_file:
                if in_sage and _SAGE_END in aline:
                    in_sage = False
                elif in_sage:
                    # Ampersands first, or the new escapes get re-escaped
                    aline = _BARE_AMPERSAND.sub(b'&amp;', aline)
                    aline = aline.replace(b'<', b'&lt;')
                    aline = aline.replace(b'>', b'&gt;')
                elif _SAGE_START in aline:
                    # A block opened and closed on one line has no
                    # interior lines, so do not enter the escaping state
                    # (the rest of the file would be mangled otherwise).
                    remainder = aline.split(_SAGE_START, 1)[1]
                    in_sage = _SAGE_END not in remainder
                escaped.append(aline)
        return b''.join(escaped)

    def _parse_tex4ht(self, html_name, linkbase=None):
        r"""
        Bust up tex4ht output into a

        - title - a string
        - graphics - a list of filenames
        - cells - list of pairs ( 'plain'|'compute', contents )
          where contents are XHTML, or un-delimited Sage code

        INPUT:

        - ``html_name`` - file name of the XHTML file to parse.

        - ``linkbase`` - optional basename; when given, links whose
          target is one of the project's HTML files are rewritten, in
          place, to ``../N/`` followed by whatever came after
          ``.html``, where ``N`` is the worksheet number (``0`` for
          the unnumbered main file).

        OUTPUT:

        The triple ``(title, graphics, cells)``.  The title is the text
        of the first ``title`` element, or the empty string.
        """
        # Can now parse valid XHTML
        tree = dom.parseString(self._escape_sage_blocks(html_name))

        # Find a title (all of them really)
        # tex4ht also marks the title with an h2 of class 'titleHead',
        # which is an alternative source should 'title' prove unsuitable.
        text_types = (dom.Node.TEXT_NODE, dom.Node.CDATA_SECTION_NODE)
        titles = []
        for e in tree.getElementsByTagName('title'):
            for node in e.childNodes:
                # Only character data carries a .data attribute
                if node.nodeType in text_types:
                    titles.append(node.data)
        if not titles:
            titles = ['']

        # Find SVG graphics from pgf/tikz placed by tex4ht
        graphics = []
        for e in tree.getElementsByTagName('object'):
            if e.hasAttribute('data'):
                graphics.append(e.getAttribute('data'))
        # Mirror above to grab "regular" graphicx \includegraphics
        # The original path is recorded, the reference loses its directory
        for e in tree.getElementsByTagName('img'):
            if e.hasAttribute('src'):
                src = e.getAttribute('src')
                graphics.append(src)
                e.setAttribute('src', os.path.basename(src))

        # Find and modify links in place
        if linkbase:
            link_pattern = _tex4ht_html_pattern(linkbase, r'(.*)')
            for e in tree.getElementsByTagName('a'):
                if e.hasAttribute('href'):
                    m = link_pattern.match(e.getAttribute('href'))
                    if m:
                        # Handle '' as 0 worksheet
                        ws_number = m.group(2) or '0'
                        e.setAttribute(
                            'href', '../' + ws_number + '/' + m.group(4))

        # Ignore headers/footers by starting with body tag
        # Collect text between compute cells
        # Identify compute cells with the 'sage' tag
        # as produced by custom configuration file for tex4ht
        thebody = tree.getElementsByTagName('body')[0]
        cells = []
        content = []
        for e in thebody.childNodes:
            if e.nodeType != dom.Node.ELEMENT_NODE:
                continue
            tag = e.tagName
            if tag == 'sage':
                # Everything gathered so far becomes one text cell
                cells.append(('plain', ''.join(content)))
                content = []
                # Assume the block has just one child
                # AND text is 7-bit ASCII at this point
                code = e.firstChild
                cells.append(('compute', code.data if code is not None else ''))
            elif tag not in ('script', 'noscript'):
                content.append(e.toxml())
        if content:
            cells.append(('plain', ''.join(content)))

        return titles[0], graphics, cells

    @staticmethod
    def _worksheet_text(css_name, cells):
        r"""
        Assemble worksheet source text from parsed cells.

        INPUT:

        - ``css_name`` - file name of the CSS file to link in.

        - ``cells`` - list of pairs as returned by ``_parse_tex4ht``.

        OUTPUT:

        A single string: a stylesheet link, then each plain cell
        verbatim and each compute cell wrapped in ``{{{`` and ``}}}``.
        """
        # Link in CSS file as part of HTML version
        # NOTE(review): the 'data/' location mirrors where the CSS file is
        # stored relative to worksheet.html in _pure_python() - confirm
        # against the notebook's handling of data files.
        content = ['<link rel="stylesheet" href="data/' + css_name
                   + '" type="text/css" />']
        # Recognize cells, adorn compute cells
        for kind, text in cells:
            if kind == 'plain':
                content.append(text)
            elif kind == 'compute':
                content.append('{{{' + text + '}}}')
        return ''.join(content)

    @staticmethod
    def _add_bytes(tar, arcname, data):
        r"""
        Store the byte string ``data`` in the open tar file ``tar``
        as a member named ``arcname``, without a temporary file.
        """
        info = tarfile.TarInfo(arcname)
        info.size = len(data)
        info.mtime = int(time.time())
        tar.addfile(info, io.BytesIO(data))

    def _convert_one_file(self, html_name, css_name, nb, user, linkbase=None):
        r"""
        Create a single worksheet from a parsed tex4ht XHTML file.

        INPUT:

        - ``html_name`` - file name of HTML file
        - ``css_name`` - an associated CSS file
        - ``nb`` - a notebook to host worksheet creation temporarily
        - ``user`` - the user directory for worksheets in this notebook
        - ``linkbase`` - optional basename used to rewrite links between
          worksheets, passed on to ``_parse_tex4ht``

        OUTPUT:

        Returns a worksheet in ``nb``.  The graphics named in the HTML
        and the CSS file are copied, from the current working directory,
        into the worksheet's data directory.
        """
        title, graphics, cells = self._parse_tex4ht(html_name, linkbase)
        # Add CSS to filename list for data directory
        graphics.append(css_name)

        # Build a worksheet in nb, and return it
        #   Set title
        #   Place files in data directory
        W = nb.create_new_worksheet(title, user)
        # Pack discovered graphics into data directory
        # Data directory does not exist initially
        # Side-effect of query is to build it
        datadir = W.data_directory()
        for filename in graphics:
            shutil.copy("./" + filename, datadir)
        text = self._worksheet_text(css_name, cells)
        W.edit_save(text.encode('ascii', 'xmlcharrefreplace'))
        nb.save_worksheet(W)
        return W

    def _create_single_sws(self, basename):
        r"""
        Creates a single Sage worksheet in a portable sws format
        from a one-section LaTeX document.

        INPUT:

        - ``basename`` - a string.  This is the basename of the original
          LaTeX input file and the basename of the tex4ht output.

        So, for example, suppose your original file is foo.tex, and when
        processed by tex4ht it produces an HTML/jsMath file called
        foo.html, and an associated CSS file foo.css.  You would provide
        ``foo`` as the input string, and would end up creating
        ``foo.sws``.

        So this routine will create a single worksheet faithfully
        representing the original intent in the LaTeX file and possibly
        including Sage compute cells.

        This assumes the necessary files are in the current working
        directory.

        OUTPUT:

        This routine creates a file foo.sws in the current working
        directory.  The return value is simply this filename as a string.
        """
        # Imported here so the rest of the module works without Sage
        from sage.misc.misc import tmp_dir
        from sagenb.notebook.notebook import Notebook

        # We make a temporary notebook to work in
        # This is located in $HOME/.sage/temp/hostname/pid/
        # Temporary directory gets deleted automatically (as process ends?)
        nbdir = tmp_dir() + 'converter.sagenb'
        nb = Notebook(nbdir)

        W = self._convert_one_file(basename + '.html', basename + '.css',
                                   nb, 'admin')
        sws_name = basename + '.sws'
        nb.export_worksheet(W.filename(), sws_name)
        return sws_name

    def _pure_python(self, basename):
        r"""
        EXPERIMENTAL:  Build an sws file without any notebook code.

        Assumes just a single file of HTML.
        Edit shebang to just call python, not sage.

        INPUT:

        - ``basename`` - basename of the HTML and CSS files, which are
          read from the current working directory.

        OUTPUT:

        Creates ``basename + '.sws'``, a bzip2-compressed tar file, in
        the current working directory and returns that filename.
        """
        css_name = basename + '.css'
        html_name = basename + '.html'

        # Break out tex4ht output
        title, graphics, cells = self._parse_tex4ht(html_name, None)
        graphics.append(css_name)

        # Piece back together in worksheet format
        body = self._worksheet_text(css_name, cells)
        body = body.encode('ascii', 'xmlcharrefreplace')

        # Make a generic worksheet configuration as a Python dictionary
        basic = {
            'name': title,
            'system': 'sage',
            'owner': 'admin',
            'last_change': ('admin', time.time()),
        }
        # Protocol 0 is what cPickle.dumps() produces by default on
        # Python 2, which is what this file format was written with.
        config = pickle.dumps(basic, 0)

        # For older versions of notebook, backward compatible
        # Just have two extra lines of info in header
        header = ''.join([title, '\n', 'system:', basic['system'], '\n'])
        header = header.encode('ascii', 'xmlcharrefreplace')

        # Build sws as a tar file, with expected name
        # NOTE(review): member names start with the input directory
        # ('./' by default) - confirm this is intended for other values.
        prefix = self._input_dir + 'sage_worksheet/'
        dataprefix = prefix + 'data/'
        sws_name = basename + '.sws'
        T = tarfile.open(sws_name, 'w:bz2')
        try:
            # Pickled configuration file
            self._add_bytes(T, prefix + 'worksheet_conf.pickle', config)
            # Worksheet files, new and old styles
            self._add_bytes(T, prefix + 'worksheet.html', body)
            self._add_bytes(T, prefix + 'worksheet.txt', header + body)
            # Data files, graphics, css, whatever
            for f in graphics:
                T.add(f, dataprefix + f)
        finally:
            T.close()
        return sws_name

    def _create_tar_archive(self, basename):
        r"""
        Convert every discovered HTML file to a worksheet in a scratch
        notebook, then bundle that notebook's ``linear`` user directory
        as ``basename + '.tar.bz2'`` in the current working directory.

        INPUT:

        - ``basename`` - basename of the project; also used as the link
          base when rewriting links between worksheets.
        """
        # this is all ad-hoc for testing
        # long-term the notebook might be temporary, or not
        # One approach would be a portable container, like sws
        # Other would be to install directly in a user's notebook
        from sagenb.notebook.notebook import Notebook

        nbdir = "/tmp/fcla.sagenb"
        nb = Notebook(nbdir)
        nb.add_user('linear', 'algebra', 'none@nobody.com',
                    account_type='user', force=True)

        cssfilename = basename + '.css'
        for _, htmlfilename in self._files:
            print("Converting:  %s" % htmlfilename)
            self._convert_one_file(htmlfilename, cssfilename, nb,
                                   'linear', basename)
        nb.save()  # for good measure

        # Bundle up as an archive
        # Get pathnames right for ez decompression as user:
        # arcname stores the tree as 'linear/...' without having to
        # change the working directory of the whole process.
        print("Forming tar archive...")
        T = tarfile.open(basename + '.tar.bz2', 'w:bz2')
        try:
            T.add(os.path.join(nbdir, 'home', 'linear'), arcname='linear')
        finally:
            T.close()

    def convert(self, dir=None, format=None):
        r"""
        The one public method.

        INPUT:

        - ``dir`` - currently unused; the files are always taken from
          the current working directory.

        - ``format`` - one of ``'sws'``, ``'tar'``, or the testing
          formats ``'xml-test'`` and ``'pure-python'``.  Defaults to
          the format guessed when the converter was created.

        A ``ValueError`` is raised for any other format.
        """
        if not format:
            format = self._likely_format
        # pass a directory to _create_single_sws?
        if format == 'sws':
            self._create_single_sws(self._basename)
        elif format == 'tar':
            self._create_tar_archive(self._basename)
        ## Calls to testing routines, not permanent
        elif format == 'xml-test':
            print(self._parse_tex4ht(self._basename + '.html',
                                     self._basename))
        elif format == 'pure-python':
            print(self._pure_python(self._basename))
        else:
            raise ValueError('unknown conversion format: %r' % (format,))


############################
#  Main
############################
#
# Create converter class
# Call convert()
if __name__ == '__main__':
    t2s = TeXtoSWS()
    t2s.convert()

    ## Testing, experimental calls
    ## t2s.convert(format = 'pure-python')
    ## t2s.convert(format='xml-test')