writing_with_film_subtitle_parser

Welcome to Etherpad!

This pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!

Get involved with Etherpad at http://etherpad.org
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import random
import re
from main.settings import db
import srt
import datetime
from pattern.en import parse

## Input files: the original video and the subtitle track that describes it.
videofile = 'input/weekly-address.mp4'
## NOTE: despite the "srt" name, this path points at a .vtt file.
srtfile = 'input/subtitle.vtt'

def convert_time(timestring):
    """Convert a timestamp string into seconds.

    The first four runs of digits in ``timestring`` are read as hours,
    minutes, seconds and milliseconds (e.g. ``"00:01:02.500"`` -> ``62.5``).

    Args:
        timestring: Text containing at least four groups of digits.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        IndexError: If fewer than four groups of digits are present.
    """
    # Build a real list: on Python 3 ``map`` returns an iterator, which
    # cannot be subscripted.
    nums = [float(digits) for digits in re.findall(r'\d+', timestring)]
    return 3600*nums[0] + 60*nums[1] + nums[2] + nums[3]/1000

def parse_vtt(lines):
    """Group subtitle lines into timed cue texts.

    A cue starts at a line containing one or more ``H:M:S.mmm`` timecodes
    and ends at the next empty line (or at the end of the input). Lines seen
    before the first timecode, or between cues, are ignored.

    Args:
        lines: Iterable of text lines with their line endings preserved
            (for example the result of ``file.readlines()``).

    Returns:
        A list of ``(times, text)`` tuples. ``times`` is the list of
        timecodes found on the cue's timing line, each converted to seconds
        by ``convert_time``; ``text`` is the cue's text lines joined by
        single spaces, with one trailing whitespace character removed.
    """
    timecode = re.compile(r"[0-9]*:[0-9]*:[0-9]*\.[0-9]*")
    timed_texts = []
    current_times, current_text = None, ""

    for line in lines:
        times = timecode.findall(line)
        ## Found a timecode
        if times:
            # A list (not ``map``) so callers can index it on Python 3 too.
            current_times = [convert_time(time) for time in times]

        ## Only a line-break. Add text and timecode to timed_texts
        elif re.match(r"^\r?\n$", line) and current_times is not None:
            timed_texts.append((current_times, re.sub(r'\s$', '', current_text)))
            current_times, current_text = None, ""

        ## Extend texts
        elif current_times is not None:
            current_text = current_text + re.sub(r'\r?\n', ' ', line)

    ## Input ended without a closing empty line: keep the pending cue
    ## instead of silently dropping it.
    if current_times is not None:
        timed_texts.append((current_times, re.sub(r'\s$', '', current_text)))

    return timed_texts

def _word_timings(text, tokens, sentence_start, char_duration):
    """Estimate start/end times for each token of one subtitle cue.

    Timing is spread evenly over the characters of ``text``: a word's start
    is its character offset times ``char_duration`` and its length in
    characters determines its duration.

    Args:
        text: The cue text the tokens were produced from.
        tokens: Iterable of token sequences whose first item is the word and
            whose second item is its tag.
        sentence_start: Start time of the cue, in seconds.
        char_duration: Seconds attributed to a single character of ``text``.

    Returns:
        A list of dicts with keys ``word``, ``start``, ``end``, ``duration``
        and ``tag``.
    """
    words = []
    cursor = 0
    for token in tokens:
        word, tag = token[0], token[1]
        position = text.find(word, cursor)
        if position == -1:
            # The token is not a literal substring of the remaining text;
            # place it at the current cursor rather than at offset -1.
            position = cursor
        else:
            # Resume after this occurrence so a repeated word is matched at
            # its own position rather than at the previous one.
            cursor = position + len(word)

        word_start = position * char_duration
        word_duration = len(word) * char_duration
        word_end = word_start + word_duration

        words.append({
            'word': word,
            'start': sentence_start + word_start,
            'end': sentence_start + word_end,
            'duration': word_duration,
            'tag': tag,
        })
    return words


## Read the subtitle file up front so the handle is closed before any
## parsing or database work happens.
with open(srtfile) as subtitle_file:
    subtitle_lines = subtitle_file.readlines()

for row in parse_vtt(subtitle_lines):
    times = list(row[0])
    text = row[1]

    ## A cue without text has nothing to tag, and would otherwise divide by
    ## zero when computing the per-character duration below.
    if not text:
        continue

    start, end = times[0], times[1]
    sentence = {
        'filename': videofile,
        'text': text,
        'start': start,
        'end': end,
        'duration': end - start,
        'words': []
    }

    char_duration = sentence['duration'] / len(text)
    ## pattern.en documents the keyword as lowercase `tokenize`; the original
    ## `Tokenize` spelling was presumably ignored -- TODO confirm.
    parsed_sentence = parse(text, tokenize=True, chunks=False)

    ## A cue can be tokenized into several sentences; use the tokens of all
    ## of them, not only the first.
    tokens = [token
              for sentence_tokens in parsed_sentence.split()
              for token in sentence_tokens]

    sentence['words'] = _word_timings(text, tokens, sentence['start'],
                                      char_duration)

    ## NOTE(review): if `db` is a PyMongo database, `insert` is deprecated in
    ## favour of `insert_one` -- confirm against main.settings before changing.
    db.sentences.insert(sentence)