EFMSA Intranet — Email Ingest & AI Search (PHP)

Overview

EFMSA ingests emails and uploads, paraphrases them, and indexes content for instant search. The system is PHP-first, using cron and MySQL FULLTEXT.

Extraction — PHP

<?php
require __DIR__ . '/vendor/autoload.php';
use PhpOffice\PhpWord\IOFactory as WordIO;
use League\CommonMark\CommonMarkConverter;
use Smalot\PdfParser\Parser as PdfParser;

/**
 * Read a whole file into memory, failing loudly instead of returning false.
 *
 * @throws \RuntimeException when the path is not a readable file or the read fails
 */
function extract_text_read_file(string $path): string {
  // Checked up front so an unreadable path raises an exception rather than a PHP warning.
  if (!is_file($path) || !is_readable($path)) {
    throw new \RuntimeException("Cannot read file for text extraction: {$path}");
  }
  $contents = file_get_contents($path);
  if ($contents === false) {
    throw new \RuntimeException("Failed to read file for text extraction: {$path}");
  }
  return $contents;
}

/**
 * Recursively collect the text of one PhpWord element.
 *
 * Elements are inspected by capability (method_exists) rather than by concrete
 * class, so any element exposing getRows(), getElements() or getText() is handled.
 * Leaf text is returned without a trailing newline so that the runs of a single
 * paragraph are joined back together; block boundaries are added by
 * extract_text_docx_block().
 *
 * @param mixed $element a PhpWord element (non-objects yield an empty string)
 */
function extract_text_from_docx_element($element): string {
  if (!is_object($element)) {
    return '';
  }

  // An explicit line break inside a paragraph; it has no text of its own.
  if ($element instanceof \PhpOffice\PhpWord\Element\TextBreak) {
    return "\n";
  }

  // Tables expose rows and cells instead of getElements(); each cell is a block.
  if (method_exists($element, 'getRows')) {
    $text = '';
    foreach ($element->getRows() as $row) {
      foreach ($row->getCells() as $cell) {
        $text .= extract_text_docx_block($cell);
      }
    }
    return $text;
  }

  // Containers (text runs, cells, list item runs, ...): walk every child.
  if (method_exists($element, 'getElements')) {
    $text = '';
    foreach ($element->getElements() as $child) {
      $isContainer = is_object($child)
        && (method_exists($child, 'getElements') || method_exists($child, 'getRows'));
      // Nested containers are block-level (e.g. paragraphs inside a cell);
      // plain leaves are inline runs and are concatenated as-is.
      $text .= $isContainer
        ? extract_text_docx_block($child)
        : extract_text_from_docx_element($child);
    }
    return $text;
  }

  if (method_exists($element, 'getText')) {
    $value = $element->getText();
    // Some elements (e.g. titles) may hand back a nested text-run object.
    if (is_object($value)) {
      return extract_text_from_docx_element($value);
    }
    // Defensive: array-valued text would otherwise be indexed as the literal "Array".
    if (is_array($value)) {
      return implode('', array_filter($value, 'is_string'));
    }
    return (string) $value;
  }

  return '';
}

/**
 * Text of a block-level DOCX element, terminated by exactly one newline when non-empty.
 *
 * @param mixed $element a PhpWord element
 */
function extract_text_docx_block($element): string {
  $text = extract_text_from_docx_element($element);
  if ($text !== '' && substr($text, -1) !== "\n") {
    $text .= "\n";
  }
  return $text;
}

/**
 * Extract plain, indexable text from a file, choosing the strategy by extension.
 *
 * - docx: walks every section with PhpWord, including tables and nested containers.
 * - md:   renders to HTML with CommonMark, strips the tags, then decodes entities.
 * - pdf:  uses the Smalot PDF parser.
 * - txt:  returned verbatim (trimmed).
 * - anything else: tags are stripped; for html/htm, entities are decoded as well.
 *
 * The extension comparison is case-insensitive.
 *
 * @param string $path filesystem path of the file to extract
 * @return array{mime: string, text: string} detected MIME label and trimmed text
 * @throws \RuntimeException when a md/txt/other file cannot be read
 *         (the DOCX and PDF libraries raise their own exceptions)
 */
function extract_text(string $path): array {
  $ext = strtolower(pathinfo($path, PATHINFO_EXTENSION));

  if ($ext === 'docx') {
    $phpWord = WordIO::load($path);
    $text = '';
    foreach ($phpWord->getSections() as $section) {
      foreach ($section->getElements() as $el) {
        $text .= extract_text_docx_block($el);
      }
    }
    return ['mime' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'text' => trim($text)];
  }

  if ($ext === 'md') {
    $md = extract_text_read_file($path);
    $converter = new CommonMarkConverter();
    $html = $converter->convert($md)->getContent();
    // Strip first, decode second: decoding first would turn an escaped "&lt;b&gt;"
    // into a real tag and strip_tags() would then delete it.
    $plain = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8');
    return ['mime' => 'text/markdown', 'text' => trim($plain)];
  }

  if ($ext === 'pdf') {
    $parser = new PdfParser();
    $pdf = $parser->parseFile($path);
    return ['mime' => 'application/pdf', 'text' => trim($pdf->getText())];
  }

  $raw = extract_text_read_file($path);
  if ($ext === 'txt') {
    return ['mime' => 'text/plain', 'text' => trim($raw)];
  }

  $txt = strip_tags($raw);
  if ($ext === 'html' || $ext === 'htm') {
    // Known HTML: entities such as &amp; must be decoded or they end up in the index.
    $txt = html_entity_decode($txt, ENT_QUOTES | ENT_HTML5, 'UTF-8');
  }
  return ['mime' => 'text/plain', 'text' => trim($txt)];
}