Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
Size: Mime:
/*-------------------------------------------------------------------------
 *
 * aio_internal.h
 *    AIO related declarations that should only be used by the AIO subsystem
 *    internally.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/aio_internal.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef AIO_INTERNAL_H
#define AIO_INTERNAL_H


#include "lib/ilist.h"
#include "port/pg_iovec.h"
#include "storage/aio.h"
#include "storage/condition_variable.h"


/*
 * The maximum number of IOs that can be batch submitted at once.
 */
#define PGAIO_SUBMIT_BATCH_SIZE 32



/*
 * State machine for handles. With some exceptions, noted below, handles move
 * linearly through all states.
 *
 * State changes should all go through pgaio_io_update_state().
 *
 * Note that the externally visible functions to start IO
 * (e.g. FileStartReadV(), via pgaio_io_start_readv()) move an IO from
 * PGAIO_HS_HANDED_OUT to at least PGAIO_HS_STAGED and at most
 * PGAIO_HS_COMPLETED_LOCAL (at which point the handle will be reused).
 */
typedef enum PgAioHandleState
{
	/* not in use */
	PGAIO_HS_IDLE = 0,

	/*
	 * Returned by pgaio_io_acquire(). The next state is either DEFINED (if
	 * pgaio_io_start_*() is called), or IDLE (if pgaio_io_release() is
	 * called).
	 */
	PGAIO_HS_HANDED_OUT,

	/*
	 * pgaio_io_start_*() has been called, but IO is not yet staged. At this
	 * point the handle has all the information for the IO to be executed.
	 */
	PGAIO_HS_DEFINED,

	/*
	 * stage() callbacks have been called, handle ready to be submitted for
	 * execution. Unless in batchmode (see c.f. pgaio_enter_batchmode()), the
	 * IO will be submitted immediately after.
	 */
	PGAIO_HS_STAGED,

	/* IO has been submitted to the IO method for execution */
	PGAIO_HS_SUBMITTED,

	/* IO finished, but result has not yet been processed */
	PGAIO_HS_COMPLETED_IO,

	/*
	 * IO completed, shared completion has been called.
	 *
	 * If the IO completion occurs in the issuing backend, local callbacks
	 * will immediately be called. Otherwise the handle stays in
	 * COMPLETED_SHARED until the issuing backend waits for the completion of
	 * the IO.
	 */
	PGAIO_HS_COMPLETED_SHARED,

	/*
	 * IO completed, local completion has been called.
	 *
	 * After this the handle will be made reusable and go into IDLE state.
	 */
	PGAIO_HS_COMPLETED_LOCAL,
} PgAioHandleState;


struct ResourceOwnerData;

/*
 * Typedef is in aio_types.h
 *
 * We don't use the underlying enums for state, target and op to avoid wasting
 * space. We tried using bitfields, but several compilers generate rather
 * horrid code for that.
 */
struct PgAioHandle
{
	/* all state updates should go through pgaio_io_update_state() */
	uint8		state;

	/* what are we operating on */
	uint8		target;

	/* which IO operation */
	uint8		op;

	/* bitfield of PgAioHandleFlags */
	uint8		flags;

	uint8		num_callbacks;

	/* using the proper type here would use more space */
	uint8		callbacks[PGAIO_HANDLE_MAX_CALLBACKS];

	/* data forwarded to each callback */
	uint8		callbacks_data[PGAIO_HANDLE_MAX_CALLBACKS];

	/*
	 * Length of data associated with handle using
	 * pgaio_io_set_handle_data_*().
	 */
	uint8		handle_data_len;

	/* XXX: could be optimized out with some pointer math */
	int32		owner_procno;

	/* raw result of the IO operation */
	int32		result;

	/**
	 * In which list the handle is registered, depends on the state:
	 * - IDLE, in per-backend list
	 * - HANDED_OUT - not in a list
	 * - DEFINED - not in a list
	 * - STAGED - in per-backend staged array
	 * - SUBMITTED - in issuer's in_flight list
	 * - COMPLETED_IO - in issuer's in_flight list
	 * - COMPLETED_SHARED - in issuer's in_flight list
	 **/
	dlist_node	node;

	struct ResourceOwnerData *resowner;
	dlist_node	resowner_node;

	/* incremented every time the IO handle is reused */
	uint64		generation;

	/*
	 * To wait for the IO to complete other backends can wait on this CV. Note
	 * that, if in SUBMITTED state, a waiter first needs to check if it needs
	 * to do work via IoMethodOps->wait_one().
	 */
	ConditionVariable cv;

	/* result of shared callback, passed to issuer callback */
	PgAioResult distilled_result;

	/*
	 * Index into PgAioCtl->iovecs and PgAioCtl->handle_data.
	 *
	 * At the moment there's no need to differentiate between the two, but
	 * that won't necessarily stay that way.
	 */
	uint32		iovec_off;

	/*
	 * If not NULL, this memory location will be updated with information
	 * about the IOs completion iff the issuing backend learns about the IOs
	 * completion.
	 */
	PgAioReturn *report_return;

	/* Data necessary for the IO to be performed */
	PgAioOpData op_data;

	/*
	 * Data necessary to identify the object undergoing IO to higher-level
	 * code. Needs to be sufficient to allow another backend to reopen the
	 * file.
	 */
	PgAioTargetData target_data;
};


typedef struct PgAioBackend
{
	/* index into PgAioCtl->io_handles */
	uint32		io_handle_off;

	/* IO Handles that currently are not used */
	dclist_head idle_ios;

	/*
	 * Only one IO may be returned by pgaio_io_acquire()/pgaio_io_acquire_nb()
	 * without having been either defined (by actually associating it with IO)
	 * or released (with pgaio_io_release()). This restriction is necessary to
	 * guarantee that we always can acquire an IO. ->handed_out_io is used to
	 * enforce that rule.
	 */
	PgAioHandle *handed_out_io;

	/* Are we currently in batchmode? See pgaio_enter_batchmode(). */
	bool		in_batchmode;

	/*
	 * IOs that are defined, but not yet submitted.
	 */
	uint16		num_staged_ios;
	PgAioHandle *staged_ios[PGAIO_SUBMIT_BATCH_SIZE];

	/*
	 * List of in-flight IOs. Also contains IOs that aren't strictly speaking
	 * in-flight anymore, but have been waited-for and completed by another
	 * backend. Once this backend sees such an IO it'll be reclaimed.
	 *
	 * The list is ordered by submission time, with more recently submitted
	 * IOs being appended at the end.
	 */
	dclist_head in_flight_ios;
} PgAioBackend;


typedef struct PgAioCtl
{
	int			backend_state_count;
	PgAioBackend *backend_state;

	/*
	 * Array of iovec structs. Each iovec is owned by a specific backend. The
	 * allocation is in PgAioCtl to allow the maximum number of iovecs for
	 * individual IOs to be configurable with PGC_POSTMASTER GUC.
	 */
	uint32		iovec_count;
	struct iovec *iovecs;

	/*
	 * For, e.g., an IO covering multiple buffers in shared / temp buffers, we
	 * need to get Buffer IDs during completion to be able to change the
	 * BufferDesc state accordingly. This space can be used to store e.g.
	 * Buffer IDs.  Note that the actual iovec might be shorter than this,
	 * because we combine neighboring pages into one larger iovec entry.
	 */
	uint64	   *handle_data;

	uint32		io_handle_count;
	PgAioHandle *io_handles;
} PgAioCtl;



/*
 * Callbacks used to implement an IO method.
 */
typedef struct IoMethodOps
{
	/* properties */

	/*
	 * If an FD is about to be closed, do we need to wait for all in-flight
	 * IOs referencing that FD?
	 */
	bool		wait_on_fd_before_close;


	/* global initialization */

	/*
	 * Amount of additional shared memory to reserve for the io_method. Called
	 * just like a normal ipci.c style *Size() function. Optional.
	 */
	size_t		(*shmem_size) (void);

	/*
	 * Initialize shared memory. First time is true if AIO's shared memory was
	 * just initialized, false otherwise. Optional.
	 */
	void		(*shmem_init) (bool first_time);

	/*
	 * Per-backend initialization. Optional.
	 */
	void		(*init_backend) (void);


	/* handling of IOs */

	/* optional */
	bool		(*needs_synchronous_execution) (PgAioHandle *ioh);

	/*
	 * Start executing passed in IOs.
	 *
	 * Shall advance state to at least PGAIO_HS_SUBMITTED.  (By the time this
	 * returns, other backends might have advanced the state further.)
	 *
	 * Will not be called if ->needs_synchronous_execution() returned true.
	 *
	 * num_staged_ios is <= PGAIO_SUBMIT_BATCH_SIZE.
	 *
	 * Always called in a critical section.
	 */
	int			(*submit) (uint16 num_staged_ios, PgAioHandle **staged_ios);

	/* ---
	 * Wait for the IO to complete. Optional.
	 *
	 * On return, state shall be on of
	 * - PGAIO_HS_COMPLETED_IO
	 * - PGAIO_HS_COMPLETED_SHARED
	 * - PGAIO_HS_COMPLETED_LOCAL
	 *
	 * The callback must not block if the handle is already in one of those
	 * states, or has been reused (see pgaio_io_was_recycled()).  If, on
	 * return, the state is PGAIO_HS_COMPLETED_IO, state will reach
	 * PGAIO_HS_COMPLETED_SHARED without further intervention by the IO
	 * method.
	 *
	 * If not provided, it needs to be guaranteed that the IO method calls
	 * pgaio_io_process_completion() without further interaction by the
	 * issuing backend.
	 * ---
	 */
	void		(*wait_one) (PgAioHandle *ioh,
							 uint64 ref_generation);
} IoMethodOps;


/* aio.c */
extern bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state);
extern void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op);
extern void pgaio_io_process_completion(PgAioHandle *ioh, int result);
extern void pgaio_io_prepare_submit(PgAioHandle *ioh);
extern bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh);
extern const char *pgaio_io_get_state_name(PgAioHandle *ioh);
const char *pgaio_result_status_string(PgAioResultStatus rs);
extern void pgaio_shutdown(int code, Datum arg);

/* aio_callback.c */
extern void pgaio_io_call_stage(PgAioHandle *ioh);
extern void pgaio_io_call_complete_shared(PgAioHandle *ioh);
extern PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh);

/* aio_io.c */
extern void pgaio_io_perform_synchronously(PgAioHandle *ioh);
extern const char *pgaio_io_get_op_name(PgAioHandle *ioh);
extern bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd);
extern int	pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov);

/* aio_target.c */
extern bool pgaio_io_can_reopen(PgAioHandle *ioh);
extern void pgaio_io_reopen(PgAioHandle *ioh);
extern const char *pgaio_io_get_target_name(PgAioHandle *ioh);


/*
 * The AIO subsystem has fairly verbose debug logging support. This can be
 * enabled/disabled at build time. The reason for this is that
 * a) the verbosity can make debugging things on higher levels hard
 * b) even if logging can be skipped due to elevel checks, it still causes a
 *    measurable slowdown
 *
 * XXX: This likely should be eventually be disabled by default, at least in
 * non-assert builds.
 */
#define PGAIO_VERBOSE		1

/*
 * Simple ereport() wrapper that only logs if PGAIO_VERBOSE is defined.
 *
 * This intentionally still compiles the code, guarded by a constant if (0),
 * if verbose logging is disabled, to make it less likely that debug logging
 * is silently broken.
 *
 * The current definition requires passing at least one argument.
 */
#define pgaio_debug(elevel, msg, ...)  \
	do { \
		if (PGAIO_VERBOSE) \
			ereport(elevel, \
					errhidestmt(true), errhidecontext(true), \
					errmsg_internal(msg, \
									__VA_ARGS__)); \
	} while(0)

/*
 * Simple ereport() wrapper. Note that the definition requires passing at
 * least one argument.
 */
#define pgaio_debug_io(elevel, ioh, msg, ...)  \
	pgaio_debug(elevel, "io %-10d|op %-5s|target %-4s|state %-16s: " msg, \
				pgaio_io_get_id(ioh), \
				pgaio_io_get_op_name(ioh), \
				pgaio_io_get_target_name(ioh), \
				pgaio_io_get_state_name(ioh), \
				__VA_ARGS__)

/* Declarations for the tables of function pointers exposed by each IO method. */
extern PGDLLIMPORT const IoMethodOps pgaio_sync_ops;
extern PGDLLIMPORT const IoMethodOps pgaio_worker_ops;
#ifdef IOMETHOD_IO_URING_ENABLED
extern PGDLLIMPORT const IoMethodOps pgaio_uring_ops;
#endif

extern PGDLLIMPORT const IoMethodOps *pgaio_method_ops;
extern PGDLLIMPORT PgAioCtl *pgaio_ctl;
extern PGDLLIMPORT PgAioBackend *pgaio_my_backend;



#endif							/* AIO_INTERNAL_H */