Source Code for /public/inc/searchindex.php

<?php
/**
 * This file produces a LunrJS-compatible search index covering the entire website.
 */

// Maximum age, in seconds, at which a previously written search index is still served from cache.
// The intended value is 24 hours (60 * 60 * 24); it is temporarily lowered to 1 second.
const SEARCH_INDEX_CACHE_TIME = 1;

/**
 * Reads a source file verbatim for a "code" entry of the search index.
 *
 * Declared at file level rather than inside search_index_json(): a named function
 * declared inside another function is (re)declared every time the outer function
 * runs, so a second call to search_index_json() in one request would be a fatal error.
 *
 * @param string $file Path of the file to read (resolved against the current working directory).
 * @return array Map with "title" (the file's basename) and "body" (its trimmed contents).
 */
function extract_code_contents($file) {
  $contents = file_get_contents($file);
  return [ "title" => basename($file), "body" => trim((string) $contents) ];
}

/**
 * Returns the JSON used for the LunrJS search index, updating the cached copy if necessary.
 *
 * The cache lives in searchindex.json in the site root (the parent of this file's
 * directory). It is read from and written to that same absolute path, so the cache
 * works regardless of the working directory of the calling script.
 *
 * Every *.php file up to two directories below the site root is indexed, except the
 * excluded prefixes listed in the function body. Files named index.php become "page"
 * entries (text content only); every other file becomes a "code" entry (raw source).
 *
 * The caller's working directory is restored before returning.
 *
 * @return string Pretty-printed JSON array of index entries ("id", "kind", "title", "body").
 */
function search_index_json() {
  $site_root = dirname(__DIR__);
  $cache_file = $site_root . '/searchindex.json';

  // If there's a (recent) cached copy, return that:
  if ( file_exists($cache_file) && filemtime($cache_file) > time() - SEARCH_INDEX_CACHE_TIME ) {
    $cached = file_get_contents($cache_file);
    if ( $cached !== false ) {
      return $cached;
    }
    // An unreadable cache file falls through to a rebuild.
  }

  // Paths (relative to the site root) starting with any of these are not indexed:
  $excluded_prefixes = [ 'inc/', 'toc/', '404.php', 'demo-harness.php', 'search/', 'source-viewer.php' ];
  $results = [];

  // The glob patterns and the extract_* helpers work on paths relative to the site
  // root, so switch there temporarily and switch back afterwards.
  $previous_dir = getcwd();
  chdir($site_root);
  try {
    // PHP's glob() has no recursive "**": each pattern matches exactly one depth
    // (root, one directory down, two directories down). glob() may return false
    // on error, hence the "?: []" fallbacks.
    $files = array_merge(
      glob('*.php') ?: [],
      glob('**/*.php') ?: [],
      glob('**/**/*.php') ?: [],
    );

    foreach ($files as $page) {
      foreach ($excluded_prefixes as $prefix) {
        if ( str_starts_with($page, $prefix) ) {
          continue 2; // skip this file entirely
        }
      }

      if ( basename($page) === 'index.php' ) {
        // Directory index: the id is the directory URL. Matching on the basename
        // (not a suffix) keeps files such as "reindex.php" out of this branch, and
        // only the trailing "index.php" is removed from the path.
        $results[] = array_merge([
          "id" => "/" . substr($page, 0, -strlen('index.php')),
          "kind" => "page",
        ], extract_page_contents($page));
      } else {
        $results[] = array_merge([
          "id" => "/" . $page,
          "kind" => "code",
        ], extract_code_contents($page));
      }
    }
  } finally {
    if ( $previous_dir !== false ) {
      chdir($previous_dir);
    }
  }

  // Encode once. Invalid UTF-8 in an indexed file is substituted instead of making
  // the whole encode fail.
  $json = json_encode($results, JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE);
  if ( $json === false ) {
    // Never overwrite the cache with a failed encode; hand back a valid, empty index.
    return '[]';
  }

  // Write the search index to a file, so we can use it next time:
  file_put_contents($cache_file, $json);

  return $json;
}
  
/**
 * Extracts the searchable title and text body of a page file.
 *
 * The title is the first quoted string passed to an add_header(...) call found in
 * the file (with any tags removed), or 'Home' when no such call is found. The body
 * is the file's content with PHP blocks and markup removed, whitespace normalised
 * and HTML entities decoded.
 *
 * @param string $file Path of the page file to read.
 * @return array Map with "title" and "body" strings.
 */
function extract_page_contents($file) {
  $source = file_get_contents($file);

  // Title: the text between the quotes (single or double) of the first add_header( "..." call.
  $header_pattern = <<<'REGEX'
    /add_header\(\s*(["'])(.+?)\1/
  REGEX;
  $found = [];
  $title = preg_match($header_pattern, $source, $found)
    ? strip_tags($found[2])
    : 'Home';

  // Body: remove closed PHP blocks first, then any markup.
  $text = strip_tags( preg_replace('/<\\?php.*?\\?>/s', '', $source) );

  // Collapse runs of spaces/tabs to one space, then turn every run of line breaks
  // (plus the blanks around it) into a single newline, and trim the ends.
  $text = preg_replace('/[ \t]+/', ' ', $text);
  $text = preg_replace('/[ \t]*[\r\n]+[ \t]*/', "\n", $text);
  $text = trim($text);

  // Decode entities so escaped text becomes searchable, then strip any tags that
  // the decoding produced.
  $text = strip_tags( html_entity_decode($text) );

  return [ "title" => $title, "body" => $text ];
}

// Direct-request entry point: when this file itself is the script being executed
// (rather than being included by another script), send the search index as JSON.
if( $_SERVER['SCRIPT_FILENAME'] == __FILE__ ) {
  header('Content-Type: application/json');
  print search_index_json();
}