Index: libmpcodecs/vf_overlay.c
===================================================================
--- libmpcodecs/vf_overlay.c	(revision 0)
+++ libmpcodecs/vf_overlay.c	(revision 0)
@@ -0,0 +1,1457 @@
+/* Copyright 2007 Jason Tackaberry <tack@urandom.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+/**
+ * \file vf_overlay.c
+ *
+ * \brief Shared memory image overlay with alpha compositing.
+ *
+ * See DOCS/tech/vf_overlay.txt for full documentation.
+ */
+
+#include "config.h"
+
+#ifdef HAVE_SHM
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+#include <inttypes.h>
+
+#include "mp_msg.h"
+#include "libvo/fastmemcpy.h"
+#include "libvo/video_out.h"
+#include "libswscale/swscale.h"
+#include "input/input.h"
+#include "osdep/timer.h"
+#include "cpudetect.h"
+#include "mangle.h"
+
+#include "mp_image.h"
+#include "vf.h"
+#include "img_format.h"
+#include "libavutil/avutil.h"
+#include "vf_scale.h"
+
+
+/// If defined will output some timing data. Useful for profiling.
+//#define STOPWATCH 8
+/// Turn off MMX for debugging.
+//#undef HAVE_MMX
+
+
+/// \name Convenience macros. Arguments may be evaluated more than once.
+//@{
+#define C64(x) ((((uint64_t)((x)|(x)<<16))<<32) | (uint64_t)(x) | ((uint64_t)(x)<<16)) ///< \a x (must fit in 15 bits) replicated into all four 16-bit words of a uint64_t.
+#define clamp(a,min,max) (((a)>(max))?(max):(((a)<(min))?(min):(a))) ///< \a a limited to the range [min, max].
+//@}
+
+
+/** \name Overlay image buffer lock flags
+ *  \brief Lock flags for controlling the state of the overlay shmem buffer.
+ *
+ * The first byte of the overlay shared memory buffer (vf_priv_s::lockbyte)
+ * is set to one of the following:
+ */
+//@{
+/** Overlay buffer is available for writing. vf_overlay stores this value once
+ *  it has finished reading from the shared buffer, and when config() attaches it.
+ */
+#define BUFFER_UNLOCKED 0x10
+/** Overlay buffer is locked. The client application stores this value and must
+ *  not write to the buffer again until vf_overlay stores BUFFER_UNLOCKED.
+ */
+#define BUFFER_LOCKED 0x20
+//@}
+
+
+
+/** \name Rectangle invalidation type flags.
+ *  \brief Indicates the type of invalidation that has occurred for a
+ *         given region.
+ *
+ * The flags are bits that may be OR'ed together. When the overlay buffer is
+ * updated by a client, the invalidation type will be RECT_CONVERT |
+ * RECT_PREMULTIPLY. When only the global alpha is changed, there is no need
+ * for a colorspace conversion, but alpha premultiplication must be redone,
+ * so in that event only RECT_PREMULTIPLY is used.
+ */
+//@{
+/// Region of the overlay buffer requires conversion to YV12A.
+#define RECT_CONVERT      0x01
+/// Region requires alpha premultiplication.
+#define RECT_PREMULTIPLY  0x02
+//@}
+
+
+
+/** Node of a singly linked list of rectangles, each given by its left/top
+ *  coordinate, its width/height and a mask of RECT_* invalidation flags.
+ */
+struct rects {
+    int x, y, w, h, type;   ///< Left, top, width, height; RECT_* flag mask.
+    struct rects *next;     ///< Next rectangle in the list, or NULL at the end.
+};
+
+
+/// Per-instance private data.
+struct vf_priv_s {
+    // Buffers allocated by alloc_overlay_data() that hold the overlay image
+    // converted to YV12 plus alpha planes.
+    uint8_t *y,       ///< Luma plane
+            *u,       ///< Chroma (Cb) plane
+            *v,       ///< Chroma (Cr) plane
+            *a,       ///< Alpha plane for luma channel
+            *uva,     ///< Alpha plane for chroma channel
+            *pre_y,   ///< Luma plane with pre-alpha-multiplied pixels
+            *pre_u,   ///< Chroma (Cb) plane with pre-alpha-multiplied pixels
+            *pre_v,   ///< Chroma (Cr) plane with pre-alpha-multiplied pixels
+            *pre_a,   /**< Inverted luma alpha (255 - a), where a is the pixel
+                           alpha scaled by the global alpha */
+            *pre_uva; /**< Inverted chroma alpha (255 - a), where a is the pixel
+                           alpha scaled by the global alpha */
+
+    /** Lockbyte points to the first byte of the shared memory buffer which
+     *  is used for synchronization. See \a BUFFER_LOCKED and \a
+     *  BUFFER_UNLOCKED flags above.
+     */
+    volatile uint8_t *lockbyte;
+    /// Points to the BGRA image in shared memory (which is simply lockbyte+16).
+    uint8_t *bgra_imgbuf;
+    /// BGR24 version of \a bgra_imgbuf
+    uint8_t *bgr24_imgbuf;
+    /// The alpha plane of \a bgra_imgbuf
+    uint8_t *alpha_imgbuf;
+
+    /// The shared memory id as returned by shmget().
+    int shm_id;
+    /// The shared memory key that config() passes to shmget().
+    key_t shm_key;
+
+    int w,           ///< Width of the overlay image (display width rounded up to even)
+        h,           ///< Height of the overlay image (display height rounded up to even)
+        mpi_w,       ///< Width of the scaled overlay image (scaled to fit mpi)
+        mpi_h,       ///< Height of the scaled overlay image
+        mpi_stride,  ///< Stride of the scaled overlay image
+        slice_y,     ///< Y-offset of overlay where compositing will begin
+        slice_h,     ///< Height of the overlay buffer to composite
+        auto_slice,  ///< 1 if slice region should be calculated automatically
+        alpha,       ///< "Global" alpha level of overlay (capped at 255 when premultiplying)
+        visible,     ///< Whether or not the overlay is visible
+        dirty,       ///< Whether or not the overlay has changed.
+        is_paused;   ///< 1 if the video is paused, 0 otherwise.
+
+    /** List of regions in the overlay image which have been updated and therefore
+     *  need to be converted from BGRA to YV12 and/or alpha-premultiplied.
+     */
+    struct rects *invalid_rects;
+    /** Last update timestamp (a GetTimerMS() value, see should_update()); we
+     *  update the overlay up to about 30 times a second.
+     */
+    unsigned int last_update_time;
+    struct SwsContext *sws_bgr24,  ///< Scaler for BGR24 to YV12
+                      *sws_y800_l, ///< Scaler for luma alpha plane
+                      *sws_y800_c; ///< Scaler for chroma alpha plane
+    /// The last mpi that was given to put_image().
+    mp_image_t *last_mpi;
+};
+
+
+
+/**
+ * \brief Array of the private data of all vf_overlay instances.
+ *
+ * The overlay buffer should survive a loadfile or loop, so when the filter is
+ * initialized we first look for an existing instance associated with the
+ * specified shared memory key and use that instead.
+ *
+ * As a result, vf_overlay instances are "persistent" (i.e., they don't get
+ * uninitialized), and the global variables below apply to all of them.
+ */
+static struct vf_priv_s **vf_overlay_priv = NULL;
+/// Number of vf_overlay instances
+static int num_instances = 0;
+
+#ifdef HAVE_MMX
+/// Global alpha in every 16-bit word; written by image_premultiply_alpha() for the MMX code.
+static uint64_t attribute_used __attribute__((aligned(8))) MM_global_alpha;
+/// 0x80 in every 16-bit word; the rounding term of the divide-by-255 approximation.
+static uint64_t attribute_used __attribute__((aligned(8))) MM_ROUND = C64(0x80);
+#endif
+
+#ifdef STOPWATCH
+/**
+ * \brief Simple timer for profiling and debugging.
+ *
+ * \param n Stopwatch identifier; ignored unless 0 <= n <= STOPWATCH and n < 10.
+ * \param text printf-style format (plus arguments) that labels and starts the
+ *             stopwatch, or NULL to stop it and print the elapsed time.
+ */
+static void
+stopwatch(int n, char *text, ...)
+{
+    va_list ap;
+    static struct {
+        unsigned int time, last_time;
+        char text[250];
+    } t[10];
+
+    if (n < 0 || n > STOPWATCH || n >= (int)(sizeof(t) / sizeof(t[0])))
+        return;
+
+    t[n].time = GetTimer();
+    if (!text) {
+        fprintf(stderr, "@@@ Stopwatch (%d): %s: %u usec\n", n, t[n].text,
+                t[n].time - t[n].last_time);
+    } else {
+        t[n].last_time = t[n].time;
+        // Bounded formatting: an over-long label is truncated, not overflowed.
+        va_start(ap, text);
+        vsnprintf(t[n].text, sizeof(t[n].text), text, ap);
+        va_end(ap);
+    }
+}
+#else
+#define stopwatch(n, text, ...)
+#endif
+
+
+
+/**
+ * \brief Allocate the buffers that hold the converted overlay image.
+ *
+ * \param priv Private data for this filter instance.
+ *
+ * \return 1 if every allocation succeeded, or 0 as soon as one fails.
+ *
+ * The YV12A planes are sized from mpi_stride/mpi_h, the BGR24 and alpha
+ * copies from w/h. Buffers are 16-byte aligned and are not cleared here.
+ */
+static int
+alloc_overlay_data(struct vf_priv_s *priv)
+{
+    int w = priv->mpi_stride, h = priv->mpi_h;
+
+    // Boilerplate: allocate buf, or make alloc_overlay_data() return 0.
+    #define alloc(buf, size) \
+        buf = (uint8_t *)memalign(16, size); \
+        if (!buf) return 0
+
+    alloc(priv->y, w * h);
+    alloc(priv->u, w * h / 4);
+    alloc(priv->v, w * h / 4);
+    alloc(priv->a, w * h);
+    alloc(priv->uva, w * h / 4);
+
+    // Buffers for alpha-multiplied pixels
+    alloc(priv->pre_y, w * h);
+    alloc(priv->pre_u, w * h / 4);
+    alloc(priv->pre_v, w * h / 4);
+    alloc(priv->pre_a, w * h);
+    alloc(priv->pre_uva, w * h / 4);
+
+    alloc(priv->alpha_imgbuf, priv->w * priv->h);
+
+    /* Holds BGR24 version of the image buffer. We hold one extra byte
+     * because in convert_bgra_to_yv12a() we copy 4 bytes at a time, but
+     * advance only 3 bytes. This is faster than 3 (or 2) copies, but means
+     * we need an extra byte so we're staying within the allocated buffer.
+     */
+    alloc(priv->bgr24_imgbuf, priv->w * priv->h * 3 + 1);
+    return 1;
+}
+
+
+
+/**
+ * \brief Detach and delete shared memory segment and free overlay buffers.
+ *
+ * \param priv Private data for this filter instance. Every released pointer
+ *             (bgra_imgbuf included) is reset to 0, so a repeat call is safe.
+ */
+static void
+free_overlay_data(struct vf_priv_s *priv)
+{
+    // Boilerplate
+    #define dealloc(buf) \
+        if (buf) { \
+            free(buf); \
+            buf = 0; \
+        }
+
+    dealloc(priv->y);
+    dealloc(priv->u);
+    dealloc(priv->v);
+    dealloc(priv->a);
+    dealloc(priv->uva);
+    dealloc(priv->pre_y);
+    dealloc(priv->pre_u);
+    dealloc(priv->pre_v);
+    dealloc(priv->pre_a);
+    dealloc(priv->pre_uva);
+
+    dealloc(priv->alpha_imgbuf);
+    dealloc(priv->bgr24_imgbuf);
+
+    if (priv->sws_bgr24)
+        sws_freeContext(priv->sws_bgr24);
+    if (priv->sws_y800_l)
+        sws_freeContext(priv->sws_y800_l);
+    if (priv->sws_y800_c)
+        sws_freeContext(priv->sws_y800_c);
+    priv->sws_bgr24 = priv->sws_y800_l = priv->sws_y800_c = 0;
+
+    // 0 is a valid shmget() id; only a negative id means there is no segment.
+    if (priv->shm_id >= 0 && priv->lockbyte) {
+        shmctl(priv->shm_id, IPC_RMID, NULL);
+        shmdt((uint8_t *)priv->lockbyte);
+        priv->lockbyte = priv->bgra_imgbuf = 0;
+    }
+}
+
+
+
+/** \brief Free all buffers for all overlay filter instances.
+ *
+ * vf_overlay instances must survive a loadfile or loop, so they are not torn
+ * down by a per-instance uninit. This function is instead meant to be
+ * registered with atexit(3) when the first instance is created, so that the
+ * shared memory segment allocated in config() is properly deleted. Freeing
+ * the overlay buffers as well is not strictly necessary during shutdown, but
+ * is done for correctness.
+ */
+static void
+free_all_overlay()
+{
+    int idx = 0;
+
+    if (!vf_overlay_priv)
+        return;
+    while (idx < num_instances)
+        free_overlay_data(vf_overlay_priv[idx++]);
+}
+
+
+
+/**
+ * \brief Rate limiter for overlay updates: tells the caller whether more
+ *        than 25 ms have passed since the last recorded update.
+ *
+ * \param priv Private data for this filter instance.
+ * \param set If nonzero and an update is allowed, last_update_time is set to
+ *            the current GetTimerMS() value.
+ * \return 1 if the overlay may be updated now, 0 otherwise.
+ */
+static int
+should_update(struct vf_priv_s *priv, int set)
+{
+    unsigned int now = GetTimerMS();
+    unsigned int elapsed = now - priv->last_update_time;
+
+    // Not enough time has passed and the timer has not wrapped: don't update.
+    if (elapsed <= 25 && now >= priv->last_update_time)
+        return 0;
+    if (set)
+        priv->last_update_time = now;
+    return 1;
+}
+
+
+
+/**
+ * \brief Format negotiation: only YV12 is handled, and only when the next
+ *        filter accepts it.
+ * \return The next filter's answer for IMGFMT_YV12, or 0 for any other format.
+ */
+static int
+query_format(struct vf_instance_s* vf, unsigned int fmt)
+{
+    return (fmt != IMGFMT_YV12) ? 0 : vf_next_query_format(vf, fmt);
+}
+
+
+
+/**
+ * \brief Configure the filter and call the next filter's config function.
+ * \return The next filter's config() result, or 0 if setup fails here.
+ */
+static int
+config(struct vf_instance_s* vf, int width, int height, int d_width, int d_height,
+       unsigned int flags, unsigned int fmt)
+{
+    struct vf_priv_s *priv = vf->priv;
+    char *accel_str;
+    uint8_t *imgbuf;
+    int bufsize;
+
+    priv->is_paused = 0;
+
+    if (priv->bgra_imgbuf) {
+        // Already initialized; doing a loadfile or a loop.
+        if (priv->w == d_width && priv->h == d_height && priv->mpi_w == width && priv->mpi_h == height) {
+            mp_msg(MSGT_VFILTER, MSGL_INFO, "overlay: reusing existing buffer (%dx%d BGRA)\n", priv->w, priv->h);
+            return vf_next_config(vf, width, height, d_width, d_height, flags, fmt);
+        }
+        // Overlay size changed: free the old buffers; bgra_imgbuf dangles afterwards.
+        free_overlay_data(priv);
+        priv->bgra_imgbuf = NULL;
+    }
+
+    priv->w = (d_width + 1) & ~1;
+    priv->h = (d_height + 1) & ~1;
+    // Automatically calculate slice by default.
+    priv->auto_slice = 1;
+    priv->mpi_w = (width + 1) & ~1;
+    priv->mpi_h = (height + 1) & ~1;
+    priv->mpi_stride = priv->mpi_w;
+
+    if (!alloc_overlay_data(priv))
+        return 0;
+
+    priv->sws_bgr24 = sws_getContext(priv->w, priv->h, PIX_FMT_BGR24, priv->mpi_w, priv->mpi_h,
+                                     PIX_FMT_YUV420P, get_sws_cpuflags() | SWS_BICUBIC, NULL, NULL, NULL);
+    priv->sws_y800_l = sws_getContext(priv->w, priv->h, PIX_FMT_GRAY8, priv->mpi_w, priv->mpi_h,
+                                      PIX_FMT_GRAY8, get_sws_cpuflags() | SWS_BICUBIC, NULL, NULL, NULL);
+    priv->sws_y800_c = sws_getContext(priv->w, priv->h, PIX_FMT_GRAY8, priv->mpi_w >> 1, priv->mpi_h >> 1,
+                                      PIX_FMT_GRAY8, get_sws_cpuflags() | SWS_BICUBIC, NULL, NULL, NULL);
+
+    // 1 lock byte + 15 padding bytes + 32bpp
+    bufsize = 16 + priv->w * priv->h * 4;
+
+    priv->shm_id = shmget(priv->shm_key, bufsize, IPC_CREAT | 0600);
+    if (priv->shm_id < 0) {
+        mp_msg(MSGT_VFILTER, MSGL_ERR, "overlay: ERROR: unable to open shmem (%d): %s\n",
+               priv->shm_key, strerror(errno));
+        return 0;
+    }
+    imgbuf = shmat(priv->shm_id, NULL, 0);
+    if (imgbuf == (void *)-1) { // shmat() reports failure as (void *)-1, never NULL
+        mp_msg(MSGT_VFILTER, MSGL_ERR, "overlay: ERROR: couldn't mmap %d bytes from shmem (%d): %s\n",
+               bufsize, priv->shm_key, strerror(errno));
+        return 0;
+    }
+
+    // Start with overlay hidden.
+    priv->visible = 0;
+    priv->alpha = 255;
+    priv->lockbyte = imgbuf;
+    priv->bgra_imgbuf = imgbuf + 16;
+    *priv->lockbyte = BUFFER_UNLOCKED;
+
+    accel_str = "no acceleration";
+#ifdef HAVE_MMX
+    if (gCpuCaps.hasMMX)
+        accel_str = "MMX accelerated";
+#endif
+
+    mp_msg(MSGT_VFILTER, MSGL_INFO, "overlay: %dx%d BGRA (frame %dx%d); shmem key: %u; %s.\n",
+           priv->w, priv->h, width, height, vf->priv->shm_key, accel_str);
+    return vf_next_config(vf, priv->mpi_w, priv->mpi_h, priv->w, priv->h, flags, fmt);
+}
+
+
+
+/**
+ * \brief Maps a rectangle from overlay coordinates to mpi coordinates.
+ *
+ * \param priv Private data for this filter instance.
+ * \param x Left coordinate in the overlay (in/out)
+ * \param y Top coordinate in the overlay (in/out)
+ * \param w Width in the overlay (in/out)
+ * \param h Height in the overlay (in/out)
+ *
+ * Each non-NULL value is divided by the overlay/mpi size ratio of its axis
+ * and truncated to an integer; widths and heights are additionally capped
+ * at mpi_w and mpi_h. NULL pointers are skipped.
+ */
+static inline void
+translate_coords(struct vf_priv_s *priv, int *x, int *y, int *w, int *h)
+{
+    float ratio_x = (float)priv->w / priv->mpi_w;
+    float ratio_y = (float)priv->h / priv->mpi_h;
+
+    if (x)
+        *x = (int)((float)*x / ratio_x);
+    if (w) {
+        int scaled_w = (int)((float)*w / ratio_x);
+        *w = (scaled_w > priv->mpi_w) ? priv->mpi_w : scaled_w;
+    }
+    if (y)
+        *y = (int)((float)*y / ratio_y);
+    if (h) {
+        int scaled_h = (int)((float)*h / ratio_y);
+        *h = (scaled_h > priv->mpi_h) ? priv->mpi_h : scaled_h;
+    }
+}
+
+/**
+ * \brief Automatically determines slice region.
+ *
+ * \param priv Private data for this filter instance.
+ *
+ * This function uses the chroma alpha plane to determine the slice region
+ * for blending the overlay.  Rather than blending the entire overlay, only
+ * the calculated slice is blended.  Autoslice is enabled by default, but the
+ * user may manually specify a slice region, in which case this function will
+ * not be called.
+ *
+ * slice_y/slice_h are set to span from the first to the last chroma row
+ * holding any non-zero alpha, with a margin of up to two chroma rows on
+ * either side, clamped to the frame. A plane without any alpha yields an
+ * empty slice (slice_h == 0).
+ */
+static void
+calculate_slice(struct vf_priv_s *priv)
+{
+    int x, y, first = -2, last = -2;
+    int row_stride = priv->mpi_stride >> 1, h = priv->mpi_h >> 1;
+    uint8_t *p = priv->uva;
+
+    stopwatch(3, "calculate_slice");
+    for (y = 0; y < h; y++, p += row_stride) {
+        uint64_t chunk = 0;
+
+        // Test 8 bytes at a time; memcpy() keeps the access alignment-safe.
+        for (x = 0; !chunk && x + 8 <= row_stride; x += 8)
+            memcpy(&chunk, p + x, 8);
+        // The remaining (at most 7) bytes, including an odd last one.
+        for (; !chunk && x < row_stride; x++)
+            chunk = p[x];
+        if (!chunk)
+            continue;
+        // Track the first and the last row that has any non-zero alpha.
+        if (first == -2)
+            first = y;
+        last = y;
+    }
+    stopwatch(3, NULL);
+    priv->slice_y = clamp((first - 2) * 2, 0, priv->mpi_h);
+    priv->slice_h = clamp((last + 2) * 2, 0, priv->mpi_h) - priv->slice_y;
+}
+
+
+
+/**
+ * \brief Do colorspace conversion from BGRA to "YV12A".
+ *
+ * \param priv Private data for this filter instance.
+ * \param ry Top row (overlay coordinates) of the region to convert.
+ * \param rh Height (overlay rows) of the region to convert.
+ *
+ * Splits full-width rows of the BGRA image into BGR24 and alpha copies, then
+ * uses the three scaler contexts to write the y/u/v, a and uva planes,
+ * scaling from overlay size to frame size in the process.
+ */
+static void
+convert_bgra_to_yv12a(struct vf_priv_s *priv, int ry, int rh)
+{
+    int i, orig_y = ry, orig_h = rh, dst_y, dst_h, src_strides[3], dst_strides[3];
+    uint8_t *p_alpha, *p_bgr24, *p_bgr32, *src[3], *dst[3];
+
+    /* The overlay y-offset and the post-scaled mpi y-offset must both be even
+     * (otherwise we see distortion in the chroma planes). If the overlay has
+     * to be scaled (overlay height differs from mpi height), start with a
+     * few extra rows on either side to cover blending by the scaler, and
+     * keep growing the slice while the scaled y-offset comes out odd.
+     */
+    i = priv->h != priv->mpi_h ? 4 : 0;
+    do {
+        dst_y = ry = clamp((orig_y - i) & ~1, 0, priv->h);
+        dst_h = rh = clamp((orig_h + 1 + i*2) & ~1, 0, priv->h - ry);
+        translate_coords(priv, 0, &dst_y, 0, &dst_h);
+        i+=2;
+    } while (dst_y % 2 != 0 && i < ry);
+
+    stopwatch(5, "convert_bgra_to_yv12a (%d - %d) -> (%d - %d)", ry, rh, dst_y, dst_h);
+
+    // Decompose BGR32 into BGR24 plus alpha plane.
+    stopwatch(6, "decompose");
+    p_alpha = priv->alpha_imgbuf + (ry * priv->w);
+    p_bgr24 = priv->bgr24_imgbuf + (ry * priv->w * 3);
+    p_bgr32 = priv->bgra_imgbuf + (ry * priv->w * 4);
+
+    for (i = priv->w * rh; i > 0; i--, p_bgr32 += 4, p_alpha++, p_bgr24 += 3) {
+        // Moving 32 bits is faster than 3 separate assignments (or one 16
+        // bit and one 8 bit move). The BGR24 buffer has one extra byte
+        // allocated to prevent an overrun.
+        *(uint32_t *)p_bgr24 = *(uint32_t *)p_bgr32;
+        *p_alpha = p_bgr32[3];
+    }
+
+    stopwatch(6, NULL);
+
+    // Source is BGR24 overlay image offset to top of slice.
+    src[0] = priv->bgr24_imgbuf + (ry * priv->w * 3);
+    src[1] = src[2] = 0;
+    src_strides[0] = priv->w * 3;
+    src_strides[1] = src_strides[2] = 0;
+
+    // Dest is YV12 buffers offset to top of slice.
+    dst[0] = priv->y + (dst_y * priv->mpi_stride);
+    dst[1] = priv->u + ((dst_y * priv->mpi_stride) >> 2);
+    dst[2] = priv->v + ((dst_y * priv->mpi_stride) >> 2);
+    dst_strides[0] = priv->mpi_stride;
+    dst_strides[1] = priv->mpi_stride >> 1;
+    dst_strides[2] = priv->mpi_stride >> 1;
+
+    // Scale BGR24 -> YV12 (no alpha); pointers are pre-offset, so the slice starts at row 0.
+    sws_scale(priv->sws_bgr24, src, src_strides, 0, rh, dst, dst_strides);
+
+    // Source is overlay-sized alpha plane offset to top of slice.
+    src[0] = priv->alpha_imgbuf + (ry * priv->w);
+    src[1] = src[2] = 0;
+    src_strides[0] = priv->w;
+    src_strides[1] = src_strides[2] = 0;
+
+    // Dest is mpi-sized alpha for luma plane offset to top of slice.
+    dst[0] = priv->a + (dst_y * priv->mpi_stride);
+    dst[1] = dst[2] = 0;
+    dst_strides[0] = priv->mpi_stride;
+    dst_strides[1] = dst_strides[2] = 0;
+
+    // Scale Y800 -> Y800 (luma alpha)
+    sws_scale(priv->sws_y800_l, src, src_strides, 0, rh, dst, dst_strides);
+
+    // Dest is mpi-sized alpha for chroma plane offset to top of slice.
+    dst[0] = priv->uva + ((dst_y * priv->mpi_stride) >> 2);
+    dst_strides[0] = priv->mpi_stride >> 1;
+
+    // Scale Y800 -> Y800 (chroma alpha); same source plane, half-size scaler context.
+    sws_scale(priv->sws_y800_c, src, src_strides, 0, rh, dst, dst_strides);
+
+    stopwatch(5, NULL);
+}
+
+
+
+/**
+ * \brief Adds a rectangle to the list of invalid regions for the overlay.
+ *        Rectangles within the list may overlap.
+ *
+ * \param priv Private data for this filter instance.
+ * \param x,y Top left coordinate of the invalid region
+ * \param w,h Width and height of the invalid region.
+ * \param type Type of invalidation. Can be one or both of RECT_CONVERT
+ *             (convert from BGRA to YV12A) and RECT_PREMULTIPLY (do alpha
+ *             premultiplication for that rectangle).
+ *
+ * The rectangle is first aligned to even coordinates and sizes and clipped
+ * to the overlay image. If a rectangle with exactly the same aligned
+ * geometry is already in the list, \a type is OR'ed into that entry and no
+ * new node is allocated; otherwise a new node is inserted at the front of
+ * the list. If the node cannot be allocated, an error is logged and the
+ * list is left unchanged.
+ */
+static void
+invalidate_rect(struct vf_priv_s *priv, int x, int y, int w, int h, int type)
+{
+    struct rects *r, *p;
+
+    // Round coordinates down to multiples of 2.
+    x &= ~1;
+    y &= ~1;
+    // Round sizes up to multiples of 2.
+    w = (w + 1) & ~1;
+    h = (h + 1) & ~1;
+
+    // Ensure coordinates are within the overlay image boundaries
+    x = clamp(x, 0, priv->w);
+    y = clamp(y, 0, priv->h);
+    w = clamp(w, 0, priv->w - x);
+    h = clamp(h, 0, priv->h - y);
+
+    // Update any existing invalidated region to reflect the new type
+    for (p = priv->invalid_rects; p != NULL; p = p->next) {
+        if (p->x == x && p->y == y && p->w == w && p->h == h) {
+            p->type |= type;
+            return;
+        }
+    }
+
+    r = malloc(sizeof(*r));
+    if (!r) {
+        mp_msg(MSGT_VFILTER, MSGL_ERR,
+               "overlay: ERROR: out of memory, region not invalidated\n");
+        return;
+    }
+    r->x = x;
+    r->y = y;
+    r->w = w;
+    r->h = h;
+    r->type = type;
+    // Insert new rect at the front
+    r->next = priv->invalid_rects;
+    priv->invalid_rects = r;
+}
+
+
+
+/**
+ * \brief Scales a color value by an alpha level, approximating r * a / 255
+ *        with shifts and adds instead of a division.
+ * \param r The color value.
+ * \param a The alpha level (0 <= a <= 255).
+ * \return The alpha-multiplied value.
+ */
+static inline uint8_t
+multiply_alpha(uint8_t r, uint8_t a)
+{
+    const int biased = 0x80 + r * a;
+    const int folded = biased + (biased >> 8);
+    return (uint8_t)(folded >> 8);
+}
+
+
+
+/// Blends src on top of dst: dst is scaled by \a alpha, then src is added.
+#define blend_byte(dst, src, alpha) (multiply_alpha((dst), (alpha)) + (src))
+
+
+/**
+ * \brief Alpha-multiplies a byte and stores the result.
+ *
+ * \param byte The byte to be multiplied.
+ * \param alpha The per-pixel alpha level of byte.
+ * \param dst_byte Receives byte multiplied by the effective alpha.
+ * \param dst_alpha Receives 255 minus the effective alpha.
+ * \param global_alpha The global alpha level (for the whole overlay image).
+ *
+ * Effective alpha: alpha * global_alpha >> 8 if global_alpha < 255, else alpha.
+ */
+static inline void
+premultiply_alpha_byte(uint8_t byte, uint8_t alpha,
+                       uint8_t *dst_byte, uint8_t *dst_alpha,
+                       int global_alpha)
+{
+    uint8_t effective = alpha;
+
+    if (global_alpha < 255)
+        effective = alpha * global_alpha >> 8;
+    *dst_byte = multiply_alpha(byte, effective);
+    *dst_alpha = 255 - effective;
+}
+
+
+
+/**
+ * \brief Applies premultiply_alpha_byte() to 8 consecutive bytes. C version.
+ */
+static void
+premultiply_alpha_byte_8_C(uint8_t *byte, uint8_t *alpha,
+                           uint8_t *dst_byte, uint8_t *dst_alpha,
+                           int global_alpha)
+{
+    int k;
+    for (k = 0; k != 8; k++)
+        premultiply_alpha_byte(byte[k], alpha[k], dst_byte + k, dst_alpha + k, global_alpha);
+}
+
+
+
+#ifdef HAVE_MMX
+/** \brief Alpha-multiplies 8 consecutive bytes. MMX version. Reads the
+ *  MM_global_alpha and MM_ROUND constants; does not execute emms itself.
+ */
+static void
+premultiply_alpha_byte_8_MMX(uint8_t *byte, uint8_t *alpha,
+                             uint8_t *dst_byte, uint8_t *dst_alpha,
+                             int global_alpha)
+{
+    asm volatile(   // NOTE(review): no clobber list is declared for the MMX registers/memory written
+        "pxor %%mm7, %%mm7\n\t"                // zero out %mm7
+        "pcmpeqb %%mm4, %%mm4\n\t"             // %mm4 = 255's
+        "movq (%3), %%mm5\n\t"        // %mm5 = alpha
+        "cmp $255, %4\n\t"           // don't apply global alpha if it's 100% opaque
+        "je 42f\n\t"
+
+        // Scale the per-pixel alpha by the global alpha
+        "movq %%mm5, %%mm6\n\t"       // %mm6 = %mm5 = alpha
+        "punpcklbw %%mm7, %%mm5\n\t"  // %mm5 = low 4 alpha bytes widened to words
+        "punpckhbw %%mm7, %%mm6\n\t"  // %mm6 = high 4 alpha bytes widened to words
+        "pmullw "MANGLE(MM_global_alpha)", %%mm5\n\t"  // alpha * global_alpha
+        "pmullw "MANGLE(MM_global_alpha)", %%mm6\n\t"
+        "psrlw $8, %%mm5\n\t"         // Divide by 256
+        "psrlw $8, %%mm6\n\t"
+        "packuswb %%mm6, %%mm5\n\t"   // Pack back into %mm5
+
+        "42: \n\t"
+        "movq %%mm4, %%mm6\n\t"       // %mm6 = %mm4 = 255's
+        "psubb %%mm5, %%mm6\n\t"      // %mm6 = 255 - alpha
+        "movq %%mm6, (%1)\n\t"        // store inverted alpha to dst_alpha
+
+        // Do alpha * bytes
+        "movq (%2), %%mm0\n\t"        // %mm0 = byte
+        "movq %%mm0, %%mm1\n\t"       // %mm1 = byte
+        "punpcklbw %%mm7, %%mm0\n\t"  // %mm0 = low 4 bytes widened to words
+        "punpckhbw %%mm7, %%mm1\n\t"  // %mm1 = high 4 bytes widened to words
+        "movq %%mm5, %%mm6\n\t"       // %mm6 = %mm5 = alpha
+        "punpcklbw %%mm7, %%mm5\n\t"  // %mm5 = low 4 alpha bytes widened to words
+        "punpckhbw %%mm7, %%mm6\n\t"  // %mm6 = high 4 alpha bytes widened to words
+        "pmullw %%mm5, %%mm0\n\t"     // alpha * bytes = (r*a)
+        "pmullw %%mm6, %%mm1\n\t"
+        // approximate division by 255
+        "movq "MANGLE(MM_ROUND)", %%mm6\n\t"   // %mm6 = round (0x80 per word)
+        "paddw %%mm6, %%mm0\n\t"      // (r*a) + 0x80
+        "paddw %%mm6, %%mm1\n\t"
+        "movq %%mm0, %%mm2\n\t"       // temp = (r*a) + 0x80
+        "movq %%mm1, %%mm3\n\t"
+        "psrlw $8, %%mm0\n\t"         // temp >> 8
+        "psrlw $8, %%mm1\n\t"
+        "paddw %%mm2, %%mm0\n\t"      // temp + (temp >> 8)
+        "paddw %%mm3, %%mm1\n\t"
+        "psrlw $8, %%mm0\n\t"         // (temp+(temp>>8))>>8
+        "psrlw $8, %%mm1\n\t"
+
+        "packuswb %%mm1, %%mm0\n\t"   // pack the 8 results back into bytes
+        "movq %%mm0, (%0)\n\t"        // store premultiplied bytes to dst_byte
+    :  "+r" (dst_byte),             // %0
+       "+r" (dst_alpha)             // %1
+    :  "r" (byte),                  // %2
+       "r" (alpha),                 // %3
+       "r" (global_alpha));         // %4
+}
+#endif
+
+
+
+/**
+ * \brief Alpha-multiplies 8 consecutive bytes.
+ *
+ * Function pointer called by image_premultiply_alpha(). It is set during
+ * vf_open to either premultiply_alpha_byte_8_C or
+ * premultiply_alpha_byte_8_MMX depending on CPU capabilities.
+ */
+static void
+(*premultiply_alpha_byte_8)(uint8_t *byte, uint8_t *alpha,
+                            uint8_t *dst_byte, uint8_t *dst_alpha,
+                            int global_alpha);
+
+
+
+/**
+ * \brief Premultiplies one row of \a n bytes, eight at a time where possible.
+ */
+static void
+premultiply_alpha_row(uint8_t *src, uint8_t *alpha, uint8_t *dst,
+                      uint8_t *dst_alpha, int n, int global_alpha)
+{
+    int x;
+
+    for (x = 0; x < (n & ~7); x += 8)
+        premultiply_alpha_byte_8(src + x, alpha + x, dst + x, dst_alpha + x, global_alpha);
+    for (; x < n; x++)
+        premultiply_alpha_byte(src[x], alpha[x], dst + x, dst_alpha + x, global_alpha);
+}
+
+
+
+/**
+ * \brief Pre-alpha-multiply all pixels of the YV12A overlay image in the
+ *        specified region.
+ *
+ * \param priv Private data for this filter instance.
+ * \param rx,ry Top left coordinate of region to premultiply.
+ * \param rw,rh Width and height of region to premultiply.
+ *
+ * The region is given in overlay coordinates and is translated to mpi
+ * coordinates first. Because chroma is subsampled 2x2, the translated
+ * region is then grown to an even origin and even size and clipped to the
+ * frame. The y/u/v/a/uva planes are read and the pre_* planes are written,
+ * using the global alpha (capped at 255).
+ */
+static void
+image_premultiply_alpha(struct vf_priv_s *priv, int rx, int ry, int rw, int rh)
+{
+    int w = priv->mpi_stride, chroma_stride = w >> 1, global_alpha = priv->alpha;
+    int luma, chroma, y;
+
+    stopwatch(4, "premultiply_alpha (%d,%d %dx%d)", rx, ry, rw, rh);
+
+    translate_coords(priv, &rx, &ry, &rw, &rh);
+
+    // Scaling can leave odd values. Grow the region to an even origin and
+    // size so luma and chroma cover the same area, then clip it so the
+    // two-rows-per-iteration loop below cannot run past the planes.
+    rw += rx & 1;
+    rx &= ~1;
+    rh += ry & 1;
+    ry &= ~1;
+    rx = clamp(rx, 0, priv->mpi_w);
+    ry = clamp(ry, 0, priv->mpi_h);
+    rw = clamp((rw + 1) & ~1, 0, priv->mpi_w - rx);
+    rh = clamp((rh + 1) & ~1, 0, priv->mpi_h - ry);
+
+    if (global_alpha > 255)
+        global_alpha = 255;
+
+#ifdef HAVE_MMX
+    if (gCpuCaps.hasMMX)
+        MM_global_alpha = C64(global_alpha);
+#endif
+
+    luma = rx + ry*w;
+    chroma = (rx>>1) + (ry>>1)*chroma_stride;
+    for (y = 0; y < rh; y += 2) {
+        // Two luma rows ...
+        premultiply_alpha_row(priv->y + luma, priv->a + luma,
+                              priv->pre_y + luma, priv->pre_a + luma,
+                              rw, global_alpha);
+        premultiply_alpha_row(priv->y + luma + w, priv->a + luma + w,
+                              priv->pre_y + luma + w, priv->pre_a + luma + w,
+                              rw, global_alpha);
+        // ... share one chroma row; U and V use the same chroma alpha.
+        premultiply_alpha_row(priv->u + chroma, priv->uva + chroma,
+                              priv->pre_u + chroma, priv->pre_uva + chroma,
+                              rw >> 1, global_alpha);
+        premultiply_alpha_row(priv->v + chroma, priv->uva + chroma,
+                              priv->pre_v + chroma, priv->pre_uva + chroma,
+                              rw >> 1, global_alpha);
+        luma += 2 * w;
+        chroma += chroma_stride;
+    }
+#ifdef HAVE_MMX
+    if (gCpuCaps.hasMMX)
+        asm volatile( "emms\n\t" ::: "memory" );
+#endif
+    stopwatch(4, NULL);
+}
+
+
+
+/**
+ * \brief Blends one plane of the overlay onto the mpi in the given slice.
+ *        C version.
+ *
+ * \param w Number of bytes to blend in each row.
+ * \param slice_h Number of rows to blend.
+ * \param dst Pointer to the buffer that will receive all blended bytes
+ *            for this plane.
+ * \param src Pointer to the mpi plane buffer.
+ * \param overlay Pointer to the overlay plane buffer.
+ * \param alpha Pointer to the alpha for this plane.
+ * \param mpi_stride Stride for src
+ * \param dmpi_stride Stride for dst
+ * \param overlay_stride Stride for both overlay and alpha
+ *
+ * No bounds checking is performed, so the caller is responsible for ensuring
+ * that all pointers are properly positioned and are sufficiently large.
+ */
+static void
+blend_plane_C(int w, int slice_h, uint8_t *dst, uint8_t *src,
+              uint8_t *overlay, uint8_t *alpha, int mpi_stride,
+              int dmpi_stride, int overlay_stride)
+{
+    while (slice_h-- > 0) {
+        int col;
+        for (col = 0; col < w; col++)
+            dst[col] = blend_byte(src[col], overlay[col], alpha[col]);
+        dst += dmpi_stride;
+        src += mpi_stride;
+        overlay += overlay_stride;
+        alpha += overlay_stride;
+    }
+}
+
+
+
+#ifdef HAVE_MMX
+/**
+ * \brief Blends one plane of the overlay onto the mpi in the given slice.
+ *        MMX version; relies on %mm7 = 0 and %mm5 = MM_ROUND (see blend_image).
+ * \see blend_plane_C for parameter details
+ */
+static void
+blend_plane_MMX(int w, int slice_h, uint8_t *dst, uint8_t *src,
+                uint8_t *overlay, uint8_t *alpha, int mpi_stride,
+                int dmpi_stride, int overlay_stride)
+{
+    int i, y;
+    long vec_w = w - w % 8;  // bytes per row done by MMX; long so "cmp" sees REG_c's width
+
+    for (y = 0; y < slice_h; y++) {
+        if (vec_w > 0) {
+            asm volatile(
+                "xor %%"REG_c", %%"REG_c"\n\t"
+
+                "1: \n\t"
+                "movq (%1, %%"REG_c"), %%mm0\n\t"  // %mm0 = mpi
+                "movq %%mm0, %%mm1\n\t"            // %mm1 = mpi
+                "movq (%3, %%"REG_c"), %%mm2\n\t"  // %mm2 = %mm3 = 255 - alpha
+                "movq %%mm2, %%mm3\n\t"
+
+                "punpcklbw %%mm7, %%mm0\n\t"  // %mm0 = low dword of mpi
+                "punpckhbw %%mm7, %%mm1\n\t"  // %mm1 = hi dword of mpi
+                "punpcklbw %%mm7, %%mm2\n\t"  // %mm2 = low dword of 255-a
+                "punpckhbw %%mm7, %%mm3\n\t"  // %mm3 = hi dword of 255-a
+                "pmullw %%mm2, %%mm0\n\t"     // (255-a) * mpi = (r*a)
+                "pmullw %%mm3, %%mm1\n\t"
+                // approximate division by 255
+                "paddw %%mm5, %%mm0\n\t"      // (r*a) + 0x80
+                "paddw %%mm5, %%mm1\n\t"
+                "movq %%mm0, %%mm2\n\t"       // temp = (r*a) + 0x80
+                "movq %%mm1, %%mm3\n\t"
+                "psrlw $8, %%mm0\n\t"         // temp >> 8
+                "psrlw $8, %%mm1\n\t"
+                "paddw %%mm2, %%mm0\n\t"      // temp + (temp >> 8)
+                "paddw %%mm3, %%mm1\n\t"
+                "psrlw $8, %%mm0\n\t"         // (temp+(temp>>8))>>8
+                "psrlw $8, %%mm1\n\t"
+
+                // MPI plane now alpha-multiplied. Add to premultiplied
+                // overlay plane.
+                "movq (%2, %%"REG_c"), %%mm2\n\t"  // %mm2 = src image (overlay)
+                "packuswb %%mm1, %%mm0\n\t"
+                "paddb %%mm2, %%mm0\n\t"
+                "movq %%mm0, (%0, %%"REG_c")\n\t"  // Store to dst (mpi)
+
+                "add $8, %%"REG_c"\n\t"
+                "cmp %4, %%"REG_c"\n\t"            // stop at the last whole 8-byte group
+                "jb 1b \n\t"
+
+            : "+r" (dst),        // %0
+              "+r" (src),        // %1
+              "+r" (overlay),    // %2
+              "+r" (alpha)       // %3
+            : "m" (vec_w)        // %4
+            : "%"REG_c, "memory", "cc");
+        }
+        // Blend the trailing w % 8 bytes of this row; they start right after
+        // the part handled by the MMX loop, not at the beginning of the row.
+        for (i = vec_w; i < w; i++)
+            dst[i] = blend_byte(src[i], overlay[i], alpha[i]);
+        src += mpi_stride;
+        dst += dmpi_stride;
+        alpha += overlay_stride;
+        overlay += overlay_stride;
+    }
+}
+#endif
+
+
+
+/**
+ * \brief Blends one plane of the overlay onto the mpi.
+ * \see blend_plane_C for parameter details.
+ *
+ * This function pointer is assigned in open(): blend_plane_C by default, or
+ * blend_plane_MMX when HAVE_MMX is defined and gCpuCaps.hasMMX is set.
+ */
+static void
+(*blend_plane)(int w, int slice_h, uint8_t *dst, uint8_t *src,
+               uint8_t *overlay, uint8_t *alpha, int mpi_stride,
+               int dmpi_stride, int overlay_stride);
+
+
+
+/**
+ * \brief Blends the overlay onto the mpi.
+ *
+ * \param priv Private data for this filter instance.
+ * \param src_mpi The source mpi (as given the vf_put_image)
+ * \param dst_mpi The destination mpi.
+ *
+ * This function composites the overlay over the video mpi in the slice
+ * region specified in a slave command. (If no slice region has been
+ * explicitly set, it defaults to the whole overlay image.) If the global
+ * alpha is 256, the overlay is simply memcpy'd to the dst_mpi in the slice
+ * region, thus ignoring the per-pixel alpha values of the overlay (in that
+ * slice). (Pixels outside the slice are copied from src_mpi). If the global
+ * alpha is 255 or less, then each pixel of the overlay is composited over
+ * the src_mpi.
+ *
+ * The overlay is clipped to the dimensions of the mpi.
+ */
+static inline void
+blend_image(struct vf_priv_s *priv, mp_image_t *src_mpi, mp_image_t *dst_mpi)
+{
+    int slice_y, slice_h, w, i, c, plane, overlay_stride[3];
+    uint8_t *dst_mpi_planes[3], *src_mpi_planes[3], *overlay, *src, *dst, *alpha,
+            *overlay_planes[3] = { priv->pre_y, priv->pre_u, priv->pre_v },
+            *alpha_planes[3] = { priv->pre_a, priv->pre_uva, priv->pre_uva };
+
+    // Clip the slice to the mpi image.  Slice region is already translated
+    // to mpi size.
+    slice_y = priv->slice_y;
+    slice_h = priv->slice_h;
+
+    if (slice_y < 0)
+        slice_y = 0;
+    else if (slice_y > src_mpi->height)
+        slice_y = src_mpi->height;
+
+    if (slice_h < 0)
+        slice_h = 0;
+    else if (slice_h > src_mpi->height - slice_y)
+        slice_h = src_mpi->height - slice_y;
+
+    stopwatch(4, "blend_image (0,%d, %dx%d)",  slice_y, priv->mpi_w, slice_h);
+
+    for (i = 0, c = 0; i < 3; i++, c = 1)  {
+        // Setup buffer positions for overlay, mpi src and mpi dst (c = chroma shift).
+        overlay_stride[i] = priv->mpi_stride >> c;
+        dst_mpi_planes[i] = dst_mpi->planes[i] + ((slice_y >> c) * dst_mpi->stride[i]);
+        src_mpi_planes[i] = src_mpi->planes[i] + ((slice_y >> c) * src_mpi->stride[i]);
+        overlay_planes[i] += (slice_y >> c) * overlay_stride[i];
+        alpha_planes[i] += (slice_y >> c) * overlay_stride[i];
+
+        if (src_mpi == dst_mpi)
+            continue;
+
+        // If we're compositing only a slice, copy the parts of the mpi above
+        // and below the slice. Chroma planes are copied at half width/height.
+        if (slice_y > 0)
+            // MPI above the overlay slice.
+            memcpy_pic(dst_mpi->planes[i], src_mpi->planes[i], src_mpi->w >> c, slice_y >> c,
+                       dst_mpi->stride[i], src_mpi->stride[i]);
+        if (slice_y + slice_h < src_mpi->height)
+            // MPI below the overlay slice.
+            memcpy_pic(dst_mpi->planes[i] + dst_mpi->stride[i] * ((slice_y+slice_h) >> c),
+                       src_mpi->planes[i] + src_mpi->stride[i] * ((slice_y+slice_h) >> c),
+                       src_mpi->w >> c, (src_mpi->height-(slice_y+slice_h)) >> c,
+                       dst_mpi->stride[i], src_mpi->stride[i]);
+    }
+
+#ifdef HAVE_MMX
+    if(gCpuCaps.hasMMX) {
+        asm volatile(
+            "pxor %%mm7, %%mm7\n\t"                // zero out %mm7
+            "movq "MANGLE(MM_ROUND)", %%mm5\n\t"   // %mm5 = round
+            ::: "memory"
+        );
+    }
+#endif
+
+    for (w = priv->mpi_stride, plane = 0; plane < 3; plane++) {
+        if (plane == 1) {
+            w >>= 1;
+            slice_h >>= 1;
+        }
+        overlay = overlay_planes[plane];
+        alpha = alpha_planes[plane];
+        src = src_mpi_planes[plane];
+        dst = dst_mpi_planes[plane];
+
+        // Global alpha is 256 which means ignore per-pixel alpha. Do straight
+        // memcpy; the source is the overlay, so its own stride must be used.
+        if (priv->alpha == 256) {
+            memcpy_pic(dst, overlay, w, slice_h, dst_mpi->stride[plane], overlay_stride[plane]);
+        } else {
+            blend_plane(w, slice_h, dst, src, overlay, alpha, src_mpi->stride[plane],
+                        dst_mpi->stride[plane], overlay_stride[plane]);
+        }
+    }
+
+#ifdef HAVE_MMX
+    if(gCpuCaps.hasMMX)
+        asm volatile( "emms\n\t" ::: "memory" );
+#endif
+    stopwatch(4, NULL);
+}
+
+
+
+/**
+ * \brief Process a frame.
+ *
+ * \param vf Instance of this filter.
+ * \param mpi The image sent by the previous filter (or decoder).
+ * \param pts Timestamp of the frame; handed unchanged to the next filter.
+ *
+ * \return The return code of the next filter.
+ *
+ * Called for every new video frame and, through control(), when the overlay
+ * needs refreshing. If the lock byte of the shared memory buffer has
+ * BUFFER_LOCKED set and the overlay is dirty, the invalidated rectangles are
+ * converted from BGRA to YV12(A) and pre-alpha-multiplied before the buffer
+ * is unlocked. A visible overlay is then composited over the mpi; otherwise
+ * the mpi planes are passed on untouched.
+ */
+static int
+put_image(struct vf_instance_s* vf, mp_image_t* mpi, double pts)
+{
+    struct vf_priv_s *priv = vf->priv;
+    mp_image_t *dmpi;
+
+    // Keep the frame around so control() can redraw it later.
+    priv->last_mpi = mpi;
+
+    /* Pending changes are processed only when the controlling application
+     * holds the buffer lock, a slave command has marked the overlay dirty
+     * and should_update() agrees. Every invalidated rectangle is then
+     * converted from BGRA to YV12A and/or pre-alpha-multiplied, as its
+     * type flags request.
+     */
+    if (*priv->lockbyte & BUFFER_LOCKED && priv->dirty && should_update(priv, 1)) {
+        struct rects *rect, *following;
+        stopwatch(3, "putimage (convert)");
+        for (rect = priv->invalid_rects; rect; rect = following) {
+            if (rect->type & RECT_CONVERT)
+                convert_bgra_to_yv12a(priv, rect->y, rect->h);
+            if (rect->type & RECT_PREMULTIPLY)
+                image_premultiply_alpha(priv, rect->x, rect->y, rect->w, rect->h);
+            following = rect->next;
+            free(rect);
+        }
+        priv->invalid_rects = 0;
+        // The YV12A copy is current again, so hand the BGRA buffer back.
+        *priv->lockbyte = BUFFER_UNLOCKED;
+        priv->dirty = 0;
+
+        if (priv->auto_slice)
+            calculate_slice(priv);
+    } else
+        stopwatch(3, "putimage (no convert)");
+
+    if (priv->visible == 0 || priv->alpha <= 0) {
+        // Overlay is hidden: export the source planes without copying.
+        dmpi = vf_get_image(vf->next, mpi->imgfmt, MP_IMGTYPE_EXPORT, MP_IMGFLAG_PRESERVE,
+                            priv->mpi_w, priv->mpi_h);
+        dmpi->planes[0] = mpi->planes[0];
+        dmpi->stride[0] = mpi->stride[0];
+        if (dmpi->flags & MP_IMGFLAG_PLANAR) {
+            dmpi->planes[1] = mpi->planes[1];
+            dmpi->stride[1] = mpi->stride[1];
+            dmpi->planes[2] = mpi->planes[2];
+            dmpi->stride[2] = mpi->stride[2];
+        }
+    } else {
+        // Overlay is visible: composite it onto a fresh image.
+        dmpi = vf_get_image(vf->next, mpi->imgfmt, mpi->type, mpi->flags, priv->mpi_w, priv->mpi_h);
+        blend_image(priv, mpi, dmpi);
+    }
+
+    stopwatch(3, NULL);
+    return vf_next_put_image(vf, dmpi, pts);
+}
+
+
+
+/**
+ * \brief Handle a slave command.
+ *
+ * \param cmd Structure holding the data for this command.
+ * \param paused The paused state of the video.
+ * \param priv Private data for this filter instance.
+ *
+ * This function is registered with a call to mp_input_add_cmd_filter() in
+ * vf_open and is used to handle MP_CMD_VF_OVERLAY (the "overlay" slave command),
+ * as well as track the pause state of the video.
+ *
+ * Slave command argument is a string in the form: cmd=args[,cmd=args[, ... ]]
+ * Possible commands are:
+ *
+ *     invalidate=x:y:w:h
+ *        Cause the specified rectangle to be updated on the overlay.
+ *        (Internally this forces BGRA->YV12A colorspace conversion.)
+ *     slice=y:h
+ *        Draw only the specified slice (top / height) of the overlay.  If
+ *        -1:-1 are specified, use autoslicing.
+ *     visible=val
+ *        Draw overlay if val is 1, or don't draw overlay if val is 0
+ *     alpha=val
+ *        Sets the global alpha level for the overlay. val==0 is
+ *        semantically equivalent to visible=0; 256 means don't alpha blend.
+ *
+ * See DOCS/tech/vf_overlay.txt for more details.
+ */
+static int
+cmd_filter(mp_cmd_t *cmd, int paused, struct vf_priv_s *priv)
+{
+    if (cmd->id == MP_CMD_VF_OVERLAY) {
+        char *p1, *p2, *args = cmd->args[0].v.s;
+        while (args && (p1 = strsep(&args, ","))) {
+            p2 = strpbrk(p1, "=,");
+            if (!p2 || *(p2+1) == 0)
+                continue;  // Command with no arguments, must be malformed.
+            *p2 = 0;       // Split "cmd=args": p1 is the command, p2+1 its arguments.
+
+            if (!strcasecmp(p1, "invalidate")) {
+                int x, y, w, h;
+                if (sscanf(p2+1, "%d:%d:%d:%d", &x, &y, &w, &h) == 4)
+                    invalidate_rect(priv, x, y, w, h, RECT_CONVERT | RECT_PREMULTIPLY);
+            }
+            else if (!strcasecmp(p1, "slice")) {
+                int y, h;
+                if (sscanf(p2+1, "%d:%d", &y, &h) == 2) {
+                    if (y == -1 || h == -1) {
+                        priv->auto_slice = 1;
+                        calculate_slice(priv);
+                    } else {
+                        translate_coords(priv, 0, &y, 0, &h);
+                        priv->slice_y = y;
+                        priv->slice_h = h;
+                        priv->auto_slice = 0;
+                    }
+                }
+            }
+            else if (!strcasecmp(p1, "alpha")) {
+                int alpha;
+                if (sscanf(p2+1, "%d", &alpha) == 1) {
+                    alpha = clamp(alpha, 0, 256);  // documented range: 0 (hidden) .. 256 (no blending)
+                    if (alpha != priv->alpha) {
+                        priv->alpha = alpha;
+                        invalidate_rect(priv, 0, 0, priv->w, priv->h, RECT_PREMULTIPLY);
+                    }
+                }
+            }
+            else if (!strcasecmp(p1, "visible"))
+                sscanf(p2+1, "%d", &priv->visible);
+            priv->dirty = 1;
+        }
+        // This command is handled, so return 1.  This causes mp_input_get_cmd
+        // to return NULL and if we're paused, it keeps us paused.
+        return 1;
+   }
+
+    if (cmd->id == MP_CMD_PAUSE)
+        priv->is_paused = !paused;
+
+    return 0;
+}
+
+
+
+/**
+ * \brief Handle VFCTRL commands.
+ *
+ * \param vf Instance of this filter.
+ * \param request The VFCTRL_* request to handle.
+ * \param data The data for the given VFCTRL command.
+ *
+ * \return CONTROL_TRUE or CONTROL_FALSE for a handled VFCTRL_PERIODIC_UPDATE,
+ *         otherwise the return value of the next filter.
+ *
+ * VFCTRL_PERIODIC_UPDATE is sent from the pause and sleep loops in mplayer.c.
+ */
+static int
+control(struct vf_instance_s *vf, int request, void *data)
+{
+    struct vf_priv_s *priv = vf->priv;
+
+    /** \bug FIXME: Can't update if we're using double buffering. This means for
+     *  double buffering the overlay update speed is only as fast as the video
+     *  frame rate.
+     */
+    if (request != VFCTRL_PERIODIC_UPDATE || (vo_doublebuffering && !priv->is_paused))
+        return vf_next_control(vf, request, data);
+
+    if (priv->last_mpi && *priv->lockbyte & BUFFER_LOCKED && should_update(priv, 0)) {
+        // Run any pending slave commands; they may mark the overlay dirty.
+        mp_input_get_cmd(0,0,1);
+        if (priv->dirty) {
+            put_image(vf, priv->last_mpi, 0);
+            vf_next_control(vf, request, data);
+            return CONTROL_TRUE;  // forces a page flip
+        }
+    }
+    // Consume the event (CONTROL_FALSE) unless a later filter returns CONTROL_TRUE.
+    return vf_next_control(vf, request, data) > 0 ? CONTROL_TRUE : CONTROL_FALSE;
+}
+
+
+/**
+ * \brief Initialize the overlay filter.
+ *
+ * \param vf Instance of this filter.
+ * \param args The arguments passed from the command line for this instance.
+ *             The only argument is an integer representing the shared memory
+ *             key.
+ * \return 1 on success, 0 on bad arguments or allocation failure.
+ *
+ * vf_overlay instances are intended to be "persistent"; in other words, they
+ * never get uninitialized. This is to allow overlay buffers to survive
+ * loadfile or a loop.
+ */
+static int
+open(vf_instance_t* vf, char* args)
+{
+    int i = 0;
+    unsigned int key;  // sscanf("%u") needs an unsigned int, not a key_t
+    struct vf_priv_s *priv, **grown;
+
+    vf->config = config;
+    vf->put_image = put_image;
+    vf->query_format = query_format;
+    vf->control = control;
+    vf->uninit = NULL;  // persistent
+
+    if(!args || sscanf(args, "%u", &key) < 1 ) {
+        mp_msg(MSGT_VFILTER, MSGL_ERR, "vf_overlay: bad args; usage: overlay=shmkey\n");
+        return 0;
+    }
+
+    // Reuse the private data of an earlier instance with the same shmkey, so
+    // that image layers survive a loadfile or a loop.
+    if (vf_overlay_priv) {
+        for (i = 0; i < num_instances; i++) {
+            if (vf_overlay_priv[i]->shm_key == (key_t)key) {
+                vf->priv = vf_overlay_priv[i];
+                return 1;
+            }
+        }
+    }
+    else {
+        /* Initial load: pick the C or MMX routines and register an atexit
+         * handler for cleanup. The filter's uninit is not used for that
+         * because it runs on every loadfile and would drop overlay data. */
+        premultiply_alpha_byte_8 = premultiply_alpha_byte_8_C;
+        blend_plane = blend_plane_C;
+#ifdef HAVE_MMX
+        if(gCpuCaps.hasMMX) {
+            premultiply_alpha_byte_8 = premultiply_alpha_byte_8_MMX;
+            blend_plane = blend_plane_MMX;
+        }
+#endif
+        atexit(free_all_overlay);
+    }
+
+    // New filter: allocate its private data and one more array slot (in elements).
+    priv = calloc(1, sizeof(*priv));
+    grown = priv ? realloc(vf_overlay_priv, (i + 1) * sizeof(*grown)) : NULL;
+    if (!grown) {
+        free(priv);
+        mp_msg(MSGT_VFILTER, MSGL_ERR, "vf_overlay: out of memory\n");
+        return 0;
+    }
+    vf_overlay_priv = grown;
+    priv->shm_key = (key_t)key;
+    mp_input_add_cmd_filter((mp_input_cmd_filter)cmd_filter, priv);
+    vf->priv = vf_overlay_priv[i] = priv;
+    num_instances = i + 1;
+    return 1;
+}
+
+
+
+/**
+ * \brief Info about this filter; referenced from filter_list[] in vf.c.
+ */
+const vf_info_t vf_info_overlay = {
+    "Shared memory image overlay with alpha compositing",
+    "overlay",
+    "Jason Tackaberry",
+    "",
+    open,  // initializes each new filter instance
+    NULL
+};
+
+#endif
Index: libmpcodecs/vf_outbuf.c
===================================================================
--- libmpcodecs/vf_outbuf.c	(revision 0)
+++ libmpcodecs/vf_outbuf.c	(revision 0)
@@ -0,0 +1,262 @@
+#include "config.h"
+
+#ifdef HAVE_SHM
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/shm.h>
+#include <sys/ipc.h>
+#include <fcntl.h>
+#include "mp_image.h"
+#include "vf.h"
+#include "img_format.h"
+
+#include <sys/mman.h>
+
+#include "mp_msg.h"
+#include "libvo/fastmemcpy.h"
+#include "libvo/video_out.h"
+#include "libswscale/swscale.h"
+#include "input/input.h"
+#include "osdep/timer.h"
+#include "libavutil/avutil.h"
+#include "vf_scale.h"
+
+// defined in fmt-conversion.h and included by vf_scale.c
+enum PixelFormat imgfmt2pixfmt(int fmt);
+
+// Lock flags.
+//
+#define BUFFER_UNLOCKED 0x10
+#define BUFFER_LOCKED 0x20
+
+
+struct vf_priv_s {
+    int mpi_w, mpi_h, active, mpifmt, buffmt,  // source size, mode (see put_image), source/buffer formats
+        dst_w, dst_h;       // size of the image written to shared memory
+    double aspect;          // d_width / d_height as given to config()
+    uint8_t *shmem;         // attached segment: 16 header bytes, then pixel data
+    int shm_id;             // id returned by shmget()
+    key_t shm_key;          // SysV key, first filter argument
+    struct SwsContext *sws; // converter from the mpi format/size to the buffer's
+};
+
+// (Re)creates the context converting mpi_w x mpi_h mpifmt frames to dst_w x dst_h buffmt.
+static void setup_sws(struct vf_priv_s *priv)
+{
+    const int flags = get_sws_cpuflags() | SWS_PRINT_INFO | SWS_BILINEAR;
+    if (priv->sws)
+        sws_freeContext(priv->sws);
+    priv->sws = sws_getContext(priv->mpi_w, priv->mpi_h, imgfmt2pixfmt(priv->mpifmt),
+                               priv->dst_w, priv->dst_h, imgfmt2pixfmt(priv->buffmt),
+                               flags, NULL, NULL, NULL);
+}
+
+
+// Sizes the buffer for the new video dimensions and attaches the shared memory.
+static int
+config(struct vf_instance_s* vf, int width, int height,
+       int d_width, int d_height, unsigned int flags, unsigned int mpifmt)
+{
+    int size;
+    vf->priv->dst_w = vf->priv->mpi_w = (width+1) & ~1;   // round up to even
+    vf->priv->dst_h = vf->priv->mpi_h = (height+1) & ~1;
+    size = vf->priv->dst_w * vf->priv->dst_h * 4 + 16;    // 4 bytes per pixel + 16 header bytes
+
+    vf->priv->shm_id = shmget(vf->priv->shm_key, size, IPC_CREAT | 0600);
+    if (vf->priv->shm_id < 0) {
+        mp_msg(MSGT_VFILTER, MSGL_ERR, "\noutbuf: ERROR: unable to open shmem (key %d)\n", vf->priv->shm_key);
+        return 0;
+    }
+    vf->priv->shmem = shmat(vf->priv->shm_id, NULL, 0);
+    if (vf->priv->shmem == (void *)-1) {  // shmat() reports failure with (void *)-1, never NULL
+        vf->priv->shmem = NULL;
+        mp_msg(MSGT_VFILTER, MSGL_ERR, "\noutbuf: ERROR: couldn't mmap %d bytes from shmem (%d)\n", size, vf->priv->shm_key);
+        return 0;
+    }
+    vf->priv->mpifmt = mpifmt;
+    vf->priv->active = 1;  // NOTE(review): overrides the "active" argument parsed in vf_open
+    vf->priv->aspect = (double)d_width/d_height;
+    mp_msg(MSGT_VFILTER, MSGL_INFO, "outbuf: %dx%d %s; shmem key: %u.\n",
+           width, height, vo_format_name(vf->priv->buffmt), vf->priv->shm_key);
+    setup_sws(vf->priv);
+    return vf_next_config(vf, width, height, d_width, d_height, flags, mpifmt);
+}
+
+
+// Accept YV12 only, and only as far as the next filter accepts it as well.
+static int
+query_format(struct vf_instance_s* vf, unsigned int fmt)
+{
+    const int is_yv12 = (fmt == IMGFMT_YV12);
+    return is_yv12 ? vf_next_query_format(vf, fmt) : 0;
+}
+
+
+// Copies the frame into the shared memory buffer (modes other than 0 and 1).
+static int
+put_image(struct vf_instance_s* vf, mp_image_t* mpi, double pts)
+{
+    struct {
+        short lock, width, height;
+        double aspect;
+    } header = {
+        .lock = BUFFER_UNLOCKED,
+        .width = vf->priv->dst_w,
+        .height = vf->priv->dst_h,
+        .aspect = vf->priv->aspect
+    };
+
+    struct timeval curtime;
+    double start_time, now;
+
+    if (vf->priv->active == 0)
+        return 0;  // mode 0: drop the frame
+
+    if (vf->priv->active == 1) {
+        return vf_next_put_image(vf, mpi, pts);  // mode 1: pass through untouched
+    }
+
+    if (vf->priv->mpi_w != mpi->width || vf->priv->mpi_h != mpi->height) {
+        vf->priv->mpi_w = mpi->width;
+        vf->priv->mpi_h = mpi->height;
+        setup_sws(vf->priv);
+    }
+
+    // Wait at most 0.1 seconds for the client to unlock the buffer.
+    gettimeofday(&curtime, NULL);
+    start_time = now = curtime.tv_sec + (curtime.tv_usec/(1000.0*1000));
+    while (*vf->priv->shmem & BUFFER_LOCKED && now - start_time < 0.1) {
+        gettimeofday(&curtime, NULL);
+        now = curtime.tv_sec + (curtime.tv_usec/(1000.0*1000));
+        usec_sleep(1);
+    }
+
+    // Report only a real timeout; this frame is then not written to the buffer.
+    if (now - start_time >= 0.1)
+        mp_msg(MSGT_VFILTER, MSGL_DBG2, "outbuf: timed out waiting for the client to unlock the buffer\n");
+
+    if (now - start_time < 0.1) {
+        switch (vf->priv->buffmt) {
+            case IMGFMT_BGR32: {
+                uint8_t *dst[3]= {vf->priv->shmem + 16, NULL, NULL};
+                int dst_stride[3]= {vf->priv->dst_w * 4, 0, 0};
+                sws_scale(vf->priv->sws, mpi->planes, mpi->stride, 0, vf->priv->mpi_h, dst, dst_stride);
+                break;
+            }
+            case IMGFMT_YV12: {
+                uint8_t *dst[3];
+                int dst_stride[3], i, stride = vf->priv->dst_w, h = vf->priv->dst_h;
+                unsigned char *p = vf->priv->shmem + 16;
+
+                for (i = 0; i < 3; p += stride * h, i++) {
+                    if (i == 1) { stride >>= 1; h >>= 1;}
+                    dst[i] = p;
+                    dst_stride[i] = stride;
+                }
+                sws_scale(vf->priv->sws, mpi->planes, mpi->stride, 0, vf->priv->mpi_h, dst, dst_stride);
+                break;
+            }
+        }
+        memcpy(vf->priv->shmem, &header, sizeof(header));
+        *vf->priv->shmem = BUFFER_LOCKED;  // set last: flags the finished frame for the client
+    }
+
+    if (vf->priv->active == 3)  // mode 3 also hands the frame to the next filter
+        return vf_next_put_image(vf, mpi, pts);
+
+    return 0;
+
+}
+        
+static int
+cmd_filter(mp_cmd_t* cmd, int paused, struct vf_priv_s * priv)
+{
+    int req_w = priv->dst_w, req_h = priv->dst_h;
+    // Only the "outbuf" slave command is handled (and consumed) here.
+    if (cmd->id != MP_CMD_VF_OUTBUF)
+        return 0;
+
+    priv->active = cmd->args[0].v.i;
+    // Optional 2nd and 3rd arguments give the buffer size; 0 means source size.
+    if (cmd->nargs > 1)
+        req_w = cmd->args[1].v.i != 0 ? cmd->args[1].v.i : priv->mpi_w;
+    if (cmd->nargs > 2)
+        req_h = cmd->args[2].v.i != 0 ? cmd->args[2].v.i : priv->mpi_h;
+    if (req_w > priv->mpi_w) req_w = priv->mpi_w;  // never larger than the source
+    if (req_h > priv->mpi_h) req_h = priv->mpi_h;
+    if (req_w != priv->dst_w || req_h != priv->dst_h) {
+        priv->dst_w = (req_w + 1) & ~1;  // round up to even
+        priv->dst_h = (req_h + 1) & ~1;
+        setup_sws(priv);
+    }
+    return 1;
+}
+
+// Frees the scaler, marks the segment for removal, detaches it and frees priv.
+static void uninit(struct vf_instance_s *vf)
+{
+    struct vf_priv_s *priv = vf->priv;
+    struct shmid_ds shmemds;
+    sws_freeContext(priv->sws);
+    shmctl(priv->shm_id, IPC_RMID, &shmemds);
+    shmdt(priv->shmem);
+    free(priv);
+}
+
+
+// Parses "shmkey[:bgr32|yv12[:active]]" and sets up a new filter instance.
+static int
+vf_open(vf_instance_t* vf, char* args)
+{
+    char *p;
+    int argn = 0;
+
+    vf->config = config;
+    vf->put_image = put_image;
+    vf->query_format = query_format;
+    vf->uninit = uninit;
+    vf->priv = calloc(1, sizeof(struct vf_priv_s));
+    if (!vf->priv)
+        return 0;
+    vf->priv->active = 1;
+    vf->priv->buffmt = IMGFMT_YV12;
+    while ((p = strsep(&args, ":"))) {
+        if (argn == 0) vf->priv->shm_key = atoi(p);
+        else if (argn == 1) {
+            if (!strcasecmp(p, "bgr32"))
+                vf->priv->buffmt = IMGFMT_BGR32;
+            else if (!strcasecmp(p, "yv12"))
+                vf->priv->buffmt = IMGFMT_YV12;
+            else {
+                mp_msg(MSGT_VFILTER, MSGL_ERR, "\noutbuf: ERROR: invalid format '%s'\n", p);
+                free(vf->priv);  // don't leak the private data when open fails
+                vf->priv = NULL;
+                return 0;
+            }
+        }
+        else if (argn == 2) vf->priv->active = atoi(p);
+        argn++;
+    }
+    mp_input_add_cmd_filter((mp_input_cmd_filter)cmd_filter, vf->priv);
+    return 1;
+}
+
+
+const vf_info_t vf_info_outbuf = {  // referenced from filter_list[] in vf.c
+    "Write video frame to shared memory",
+    "outbuf",
+    "Jason Tackaberry",
+    "",
+    vf_open,  // sets up each new filter instance
+    NULL
+};
+
+#endif
Index: libmpcodecs/vf.c
===================================================================
--- libmpcodecs/vf.c	(revision 27232)
+++ libmpcodecs/vf.c	(working copy)
@@ -99,6 +99,10 @@
 extern const vf_info_t vf_info_blackframe;
 extern const vf_info_t vf_info_geq;
 extern const vf_info_t vf_info_ow;
+#ifdef HAVE_SHM
+extern const vf_info_t vf_info_overlay;  // defined const in vf_overlay.c
+extern const vf_info_t vf_info_outbuf;   // defined const in vf_outbuf.c
+#endif
 
 // list of available filters:
 static const vf_info_t* const filter_list[]={
@@ -190,6 +194,10 @@
 #endif
     &vf_info_yadif,
     &vf_info_blackframe,
+#ifdef HAVE_SHM
+    &vf_info_overlay,
+    &vf_info_outbuf,
+#endif
     &vf_info_ow,
     NULL
 };
Index: libmpcodecs/vf.h
===================================================================
--- libmpcodecs/vf.h	(revision 27232)
+++ libmpcodecs/vf.h	(working copy)
@@ -70,6 +70,7 @@
     int value;
 } vf_equalizer_t;
 
+#define VFCTRL_NOTIFY_PTS         99
 #define VFCTRL_QUERY_MAX_PP_LEVEL 4 /* test for postprocessing support (max level) */
 #define VFCTRL_SET_PP_LEVEL 5 /* set postprocessing level */
 #define VFCTRL_SET_EQUALIZER 6 /* set color options (brightness,contrast etc) */
@@ -86,6 +87,7 @@
 #define VFCTRL_GET_PTS         17 /* Return last pts value that reached vf_vo*/
 #define VFCTRL_SET_DEINTERLACE 18 /* Set deinterlacing status */
 #define VFCTRL_GET_DEINTERLACE 19 /* Get deinterlacing status */
+#define VFCTRL_PERIODIC_UPDATE 20 /* Called whenever MPlayer is idle */
 
 #include "vfcap.h"
 
Index: input/input.c
===================================================================
--- input/input.c	(revision 27232)
+++ input/input.c	(working copy)
@@ -191,7 +191,12 @@
   
   { MP_CMD_SEEK_CHAPTER, "seek_chapter", 1, { {MP_CMD_ARG_INT,{0}}, {MP_CMD_ARG_INT,{0}}, {-1,{0}} } },
   { MP_CMD_SET_MOUSE_POS, "set_mouse_pos", 2, { {MP_CMD_ARG_INT,{0}}, {MP_CMD_ARG_INT,{0}}, {-1,{0}} } },
-  
+
+#ifdef HAVE_SHM
+  { MP_CMD_VF_OVERLAY, "overlay", 1, { {MP_CMD_ARG_STRING,{0}}, {-1,{0}}}},
+  { MP_CMD_VF_OUTBUF, "outbuf", 1, { {MP_CMD_ARG_INT,{0}}, {MP_CMD_ARG_INT,{0}}, {MP_CMD_ARG_INT,{0}}, {-1,{0}}}},
+#endif
+
   { 0, NULL, 0, {} }
 };
 
Index: input/input.h
===================================================================
--- input/input.h	(revision 27232)
+++ input/input.h	(working copy)
@@ -3,6 +3,8 @@
 
 // All command IDs
 typedef enum {
+  MP_CMD_VF_OVERLAY,
+  MP_CMD_VF_OUTBUF,
   MP_CMD_SEEK,
   MP_CMD_AUDIO_DELAY,
   MP_CMD_QUIT,
Index: Makefile
===================================================================
--- Makefile	(revision 27232)
+++ Makefile	(working copy)
@@ -140,6 +140,8 @@
               libmpcodecs/vf_mirror.c \
               libmpcodecs/vf_noformat.c \
               libmpcodecs/vf_noise.c \
+              libmpcodecs/vf_outbuf.c \
+              libmpcodecs/vf_overlay.c \
               libmpcodecs/vf_ow.c \
               libmpcodecs/vf_palette.c \
               libmpcodecs/vf_perspective.c \
Index: DOCS/tech/slave.txt
===================================================================
--- DOCS/tech/slave.txt	(revision 27232)
+++ DOCS/tech/slave.txt	(working copy)
@@ -195,6 +195,10 @@
 osd_show_text <string> [duration] [level]
     Show <string> on the OSD.
 
+overlay <command>
+    Manipulate the overlay filter.  See DOCS/tech/vf_overlay.txt for a
+    detailed description of what commands are available.
+
 panscan <-1.0 - 1.0> | <0.0 - 1.0> <abs>
     Increase or decrease the pan-and-scan range by <value>, 1.0 is the maximum.
     Negative values decrease the pan-and-scan range.
Index: DOCS/man/en/mplayer.1
===================================================================
--- DOCS/man/en/mplayer.1	(revision 27232)
+++ DOCS/man/en/mplayer.1	(working copy)
@@ -6992,6 +6992,35 @@
 .PD 1
 .
 .TP
+.B overlay=shmkey
+Provides an overlay image buffer that can be accessed via shared memory.
+This filter can be used by applications controlling MPlayer to provide a
+custom on-screen display.
+The overlay image is composited over the running video and supports global 
+and per-pixel alpha blending.
+Pixels are specified in BGRA format, and the size of the overlay image is the
+video display size.
+In order to have a specific overlay image size, you can precede this filter
+with the scale, expand, and/or dsize filters.
+.sp 1
+The filter is controlled by the overlay slave command, and it may be used
+even when the video is paused.
+This slave command can be used to update regions of the overlay image, toggle
+visibility, adjust the global alpha level, etc.
+.sp 1
+.PD 0
+.RSs
+.IPs <shmkey>
+an integer that will be used as the key for the SysV shared memory segment
+.RE
+.PD 1
+.sp 1
+.RS
+See DOCS/tech/vf_overlay.txt for more details about how this filter works and
+how to control it with the slave command.
+.REss
+.
+.TP
 .B framestep=I|[i]step
 Renders only every nth frame or every intra frame (keyframe).
 .sp 1
Index: mplayer.c
===================================================================
--- mplayer.c	(revision 27232)
+++ mplayer.c	(working copy)
@@ -1327,6 +1327,30 @@
 }
 
 
+/**
+ * \brief send a VFCTRL_PERIODIC_UPDATE through the filter chain
+ * \param vf first video filter to receive the control
+ * \param vo needed to flip if requested
+ * \param time_avail time available; below 0.01 the control is not sent
+ * \return 1 if the control was skipped for lack of time or the chain
+ *         returned a value > 0, 0 otherwise (including when vf is NULL)
+ * A CONTROL_OK result from the filter chain triggers a flip_page once the
+ * vo has been configured (vo_config_count != 0).
+ */
+static int periodic_update(vf_instance_t *vf, const vo_functions_t *vo,
+                            float time_avail) {
+   int ret;
+   if (!vf)
+     return 0;
+   if (time_avail < 0.01)
+     return 1;
+   ret = vf->control(vf, VFCTRL_PERIODIC_UPDATE, &time_avail);
+   if (ret == CONTROL_OK && vo_config_count)
+     vo->flip_page();
+   return ret > 0;
+}
+
+
 typedef struct mp_osd_msg mp_osd_msg_t;
 struct mp_osd_msg {
     /// Previous message on the stack.
@@ -1786,6 +1810,8 @@
 	current_module="sleep_rtc";
         while (time_frame > 0.000) {
 	    unsigned long rtc_ts;
+	    periodic_update(mpctx->sh_video->vfilter, mpctx->video_out, time_frame);
+	    time_frame-=GetRelativeTime();
 	    if (read(rtc_fd, &rtc_ts, sizeof(rtc_ts)) <= 0)
 		mp_msg(MSGT_CPLAYER, MSGL_ERR, MSGTR_LinuxRTCReadError, strerror(errno));
     	    time_frame -= GetRelativeTime();
@@ -1798,8 +1824,10 @@
 	float margin = softsleep ? 0.011 : 0;
 	current_module = "sleep_timer";
 	while (time_frame > margin) {
-	    usec_sleep(1000000 * (time_frame - margin));
+	    int mul = 100 * !periodic_update(mpctx->sh_video->vfilter, mpctx->video_out, time_frame);
 	    time_frame -= GetRelativeTime();
+	    usec_sleep(10000 * mul * (time_frame - margin));
+	    time_frame -= GetRelativeTime();
 	}
 	if (softsleep){
 	    current_module = "sleep_soft";
@@ -2352,7 +2380,7 @@
     if (mpctx->audio_out && mpctx->sh_audio)
 	mpctx->audio_out->pause();	// pause audio, keep data if possible
 
-    while ( (cmd = mp_input_get_cmd(20, 1, 1)) == NULL
+    while ( (cmd = mp_input_get_cmd(3, 1, 1)) == NULL
             || cmd->id == MP_CMD_SET_MOUSE_POS) {
 	if (cmd) {
 	  cmd = mp_input_get_cmd(0,1,0);
@@ -2373,6 +2401,9 @@
 	if (vf_menu)
 	    vf_menu_pause_update(vf_menu);
 #endif
+	if (mpctx->sh_video && periodic_update(mpctx->sh_video->vfilter, mpctx->video_out, 1))
+	    usec_sleep(1000);
+	else
 	usec_sleep(20000);
     }
     if (cmd && cmd->id == MP_CMD_PAUSE) {
Index: mencoder.c
===================================================================
--- mencoder.c	(revision 27232)
+++ mencoder.c	(working copy)
@@ -231,6 +231,11 @@
 char *current_module;
 #include "mpcommon.h"
 
+// Stubs to satisfy the linker for filters that might call these input
+// functions (such filters are probably only useful for mplayer).
+void mp_input_add_cmd_filter(void *func, void* ctx) {} // does nothing
+void *mp_input_get_cmd(int time, int paused, int peek_only) { return NULL; } // never yields a command
+
 //char *out_audio_codec=NULL; // override audio codec
 //char *out_video_codec=NULL; // override video codec
 
